1 /*  tofasta.c
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name:  tofasta.c
27 *
28 * Author:  James Ostell
29 *
30 * Version Creation Date: 7/12/91
31 *
32 * $Revision: 6.313 $
33 *
34 * File Description:  various sequence objects to fasta output
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date       Name        Description of modification
39 * -------  ----------  -----------------------------------------------------
40 *
41 *
42 * ==========================================================================
43 */
44 #include <tofasta.h>
45 #include <gather.h>
46 #include <sqnutils.h>  /* MakeSeqID */
47 #include <subutil.h>   /* MOLECULE_TYPE_GENOMIC */
48 #include <explore.h>
49 #include <objloc.h>
50 #include <objfdef.h>
51 #include <asn2gnbi.h>
52 #include <objmacro.h>
53 #include <macroapi.h>
54 
/* Character-read primitive: fgetc on OS_UNIX_DARWIN builds, getc everywhere else. */
#ifdef OS_UNIX_DARWIN
#define NLM_GETC fgetc
#else
#define NLM_GETC getc
#endif
/* SeqLocNew(x) is a plain alias for ValNodeNew(x). */
#define SeqLocNew(_a) ValNodeNew((_a))
61 
/*
 * Ranking of Seq-id types for nucleotide Bioseqs, indexed by Seq-id choice.
 * BioseqToFastaX keeps the smallest value found among a Bioseq's ids and
 * starts from 255, so a 255 entry never lowers the result.
 */
static Uint1 na_order[NUM_SEQID] = {   /* order of nucleic acid deflines */
    255, /* 0 = not set */
    230, /* 1 = local Object-id */
    30,  /* 2 = gibbsq */
    30,  /* 3 = gibbmt */
    255, /* 4 = giim Giimport-id */
    20, /* 5 = genbank */
    20, /* 6 = embl */
    255, /* 7 = pir */
    255, /* 8 = swissprot */
    40,  /* 9 = patent */
    15, /* 10 = other TextSeqId (RefGene) */
    50, /* 11 = general Dbtag */
    120,  /* 12 = gi */
    20, /* 13 = ddbj */
    255, /* 14 = prf */
    30, /* 15 = pdb */
    20,  /* 16 = tpg */
    20,  /* 17 = tpe */
    20,  /* 18 = tpd */
    20,  /* 19 = gpp */
    20   /* 20 = nat */
    };
/*
 * Ranking of Seq-id types for protein Bioseqs, indexed by Seq-id choice.
 * BioseqToFastaX keeps the smallest value found among a Bioseq's ids and
 * starts from 255, so a 255 entry never lowers the result.
 */
static Uint1 aa_order[NUM_SEQID] = {   /* order of amino acid (protein) deflines */
    255, /* 0 = not set */
    230, /* 1 = local Object-id */
    40,  /* 2 = gibbsq */
    40,  /* 3 = gibbmt */
    255, /* 4 = giim Giimport-id */
    60, /* 5 = genbank */
    60, /* 6 = embl */
    30, /* 7 = pir */
    20, /* 8 = swissprot */
    80,  /* 9 = patent */
    15, /* 10 = other TextSeqId (RefGene) */
    90, /* 11 = general Dbtag */
    120,  /* 12 = gi */
    60, /* 13 = ddbj */
    70, /* 14 = prf */
    50, /* 15 = pdb */
    60,  /* 16 = tpg */
    60,  /* 17 = tpe */
    60,  /* 18 = tpd */
    60,  /* 19 = gpp */
    60   /* 20 = nat */
    };
#define FASTA_BUFFER_LEN 524288  /* capacity of the line buffers handed to MyFsa in this file */
#define PATENT_ORDER 110         /* order for any patent */
110 /*****************************************************************************
111 *
112 *   The above sets the ordering to be, lowest to highest
113 *
114 Nucleic Acids:
115     GenBank/EMBL/DDBJ
116     PDB
117     Patents
118     Anything else
119 Proteins:
120     SWISSPROT
121     PIR
122     NCBI BackBone (but not in GenBank)
123     PDB
124     GenBank/EMBL/DDBJ translations
125     PRF
126     Patents
127     Anything else
128 *
129 *****************************************************************************/
/*
 * Look up the defline ranking of a Seq-id type.
 *
 *   choice  - Seq-id choice value, used as an index into the order tables
 *   is_prot - TRUE reads aa_order, FALSE reads na_order
 *
 * Returns the table entry, or -1 when choice is not a valid index.
 */
Int4 GetOrderBySeqId(Int4 choice, Boolean is_prot)
{
    /* valid indices are 0 .. NUM_SEQID-1; the tables hold exactly NUM_SEQID entries */
    if (choice < 0 || choice >= NUM_SEQID)
        return -1;
    if (is_prot)
        return aa_order[choice];
    return na_order[choice];
}
139 /*****************************************************************************
140 *
141 *   Traversal routine for SeqEntryToFasta
142 *
143 *****************************************************************************/
/*
 * Per-entry callback handed to SeqEntryExplore by the SeqEntrysToFasta family.
 *
 *   sep    - entry being visited (a Bioseq or a Bioseq-set)
 *   data   - the FastaPtr traversal state
 *   index  - not used here
 *   indent - nesting depth of sep, used to track seg-set / parts scopes
 *
 * Records organism, tech and (for protein output) the accession of
 * non-protein Bioseqs into the MyFsa block, then writes the Bioseq according
 * to the group_segs mode.  Sets got_one in the state when something is written.
 */
void SeqEntryFasta (SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent)
{
  FastaPtr      state = (FastaPtr) data;
  MyFsaPtr      out = state->mfp;
  BioseqPtr     bsp = NULL;
  BioseqSetPtr  bssp = NULL;
  ValNodePtr    descr;
  OrgRefPtr     org = NULL;
  MolInfoPtr    molinfo = NULL;
  SeqIdPtr      id;
  TextSeqIdPtr  text_id;
  Boolean       want_na;

  /* parts-only mode: an entry at or above the parts set's depth closes it */
  if (state->group_segs == 2 && state->parts != -1 && indent <= state->parts) {
    state->parts = -1;
    state->seg = -1;
  }

  if (IS_Bioseq(sep)) {
    bsp = (BioseqPtr)(sep->data.ptrvalue);
    descr = bsp->descr;
  } else {
    bssp = (BioseqSetPtr)(sep->data.ptrvalue);
    descr = bssp->descr;
  }

  /* walk the descriptors; the last source/org and the last molinfo win */
  for (; descr != NULL; descr = descr->next) {
    if (descr->choice == Seq_descr_source) {
      org = ((BioSourcePtr)(descr->data.ptrvalue))->org;
    } else if (descr->choice == Seq_descr_org) {
      org = (OrgRefPtr)(descr->data.ptrvalue);
    } else if (descr->choice == Seq_descr_molinfo) {
      molinfo = (MolInfoPtr)(descr->data.ptrvalue);
    }
  }

  /* organism is only overwritten when a name is available; tech is always reset */
  if (org != NULL) {
    if (org->taxname != NULL) {
      out->organism = org->taxname;
    } else if (org->common != NULL) {
      out->organism = org->common;
    }
  }
  if (molinfo == NULL) {
    out->tech = 0 ;
  } else {
    out->tech = molinfo->tech;
  }

  /* sets are never written themselves; in parts-only mode note where we are */
  if (! IS_Bioseq(sep)) {
    if (state->group_segs == 2) {
      if (bssp->_class == 2) {            /* segset */
        state->seg = indent;
      } else if (bssp->_class == 4        /* parts */
                 && state->seg >= 0 && state->seg < indent) {
        state->parts = indent;            /* in parts set */
      }
    }
    return;
  }

  /* protein output requested but this Bioseq is not protein: keep its accession */
  want_na = state->is_na;
  if (! want_na && ! ISA_aa(bsp->mol)) {
    for (id = bsp->id; id != NULL; id = id->next) {
      switch (id->choice) {
        case SEQID_GENBANK:
        case SEQID_EMBL:
        case SEQID_DDBJ:
        case SEQID_OTHER:
        case SEQID_TPG:
        case SEQID_TPE:
        case SEQID_TPD:
        case SEQID_GPIPE:
          text_id = (TextSeqIdPtr)(id->data.ptrvalue);
          if (text_id->accession != NULL) {
            out->accession = text_id->accession;
          }
          break;
        default:
          break;
      }
    }
  }

  /* a segmented Bioseq was just written: skip everything nested below it */
  if (state->last_indent != -1) {
    if (indent > state->last_indent) {
      return;
    }
    state->last_indent = -1;
  }

  switch (state->group_segs) {
    case 0:     /* raw Bioseqs only */
      if (BioseqRawToFastaX(bsp, out, want_na)) {
        state->got_one = TRUE;
      }
      break;
    case 1:     /* segmented sets as one entry */
      if (BioseqToFastaX(bsp, out, want_na)) {
        state->got_one = TRUE;
        if (bsp->repr == Seq_repr_seg) {
          state->last_indent = indent;
        }
      }
      break;
    case 2:     /* only the parts of segmented sets */
      if (state->parts >= 0 && BioseqRawToFastaX(bsp, out, want_na)) {
        state->got_one = TRUE;
      }
      break;
    default:
      break;
  }
}
281 /*****************************************************************************
282 *
283 *   SeqEntryToFasta(sep, fp, is_na)
284 *
285 *****************************************************************************/
SeqEntryToFasta(SeqEntryPtr sep,FILE * fp,Boolean is_na)286 NLM_EXTERN Boolean SeqEntryToFasta (SeqEntryPtr sep, FILE *fp, Boolean is_na)
287 {
288     if (IS_Bioseq(sep))
289         return SeqEntrysToFasta(sep, fp, is_na, 3);
290     else
291         return SeqEntrysToFasta(sep, fp, is_na, 0);
292 }
293 static Boolean SeqEntrysToFastaXX (SeqEntryPtr sep, FILE *fp, Boolean is_na, Uint1 group_segs, Boolean printid_general);
SeqEntryToFastaEx(SeqEntryPtr sep,FILE * fp,Boolean is_na,Boolean printid_general)294 NLM_EXTERN Boolean SeqEntryToFastaEx (SeqEntryPtr sep, FILE *fp, Boolean is_na, Boolean printid_general)
295 {
296     if (IS_Bioseq(sep))
297         return SeqEntrysToFastaXX(sep, fp, is_na, 3, printid_general);
298     else
299         return SeqEntrysToFastaXX(sep, fp, is_na, 0, printid_general);
300 }
301 /*****************************************************************************
302 *
*   FastaFileFunc(bsp, key, buf, buflen, data)
304 *       standard "write to file" callback
305 *
306 *****************************************************************************/
FastaFileFunc(BioseqPtr bsp,Int2 key,CharPtr buf,Uint4 buflen,Pointer data)307 NLM_EXTERN Boolean FastaFileFunc (BioseqPtr bsp, Int2 key, CharPtr buf,
308                                   Uint4 buflen, Pointer data)
309 {
310     FILE * fp;
311     fp = (FILE *)data;
312     switch (key)
313     {
314         case FASTA_ID:
315             fprintf(fp, ">%s ", buf);
316             break;
317         case FASTA_DEFLINE:
318             fprintf(fp, "%s\n", buf);
319             break;
320         case FASTA_SEQLINE:
321             fprintf(fp, "%s\n", buf);
322             break;
323         case FASTA_EOS:   /* end of sequence */
324             break;
325         default:
326             break;
327     }
328     return TRUE;
329 }
330 /*****************************************************************************
331 *
*   FastaDumpFileFunc(bsp, key, buf, buflen, data)
333 *       standard "write to file" callback
334 *
335 *    Used for BLAST (FASTA) databases.  If the defline is
336 *    longer than buflen, then check that an ID is not
337 *    truncated in the middle.
338 *
339 *****************************************************************************/
FastaDumpFileFunc(BioseqPtr bsp,Int2 key,CharPtr buf,Uint4 buflen,Pointer data)340 NLM_EXTERN Boolean FastaDumpFileFunc (BioseqPtr bsp, Int2 key, CharPtr buf,
341                                   Uint4 buflen, Pointer data)
342 {
343     FILE * fp;
344     fp = (FILE *)data;
345     switch (key)
346     {
347         case FASTA_ID:
348             fprintf(fp, ">%s ", buf);
349             break;
350         case FASTA_DEFLINE:
351             if (buflen >= FASTA_BUFFER_LEN-1)
352             {
353                 Uint4 index=buflen;
354                 while (index > 0 && buf[index] != ' ')
355                 {
356                     if (buf[index] == '\001')
357                     {
358                         buf[index] = NULLB;
359                         break;
360                     }
361                     index--;
362                 }
363             }
364             fprintf(fp, "%s\n", buf);
365             break;
366         case FASTA_SEQLINE:
367             fprintf(fp, "%s\n", buf);
368             break;
369         case FASTA_EOS:   /* end of sequence */
370             break;
371         default:
372             break;
373     }
374     return TRUE;
375 }
376 /*****************************************************************************
377 *
378 *   SeqEntrysToFasta(sep, fp, is_na, group_segs)
379 *
380 *       group_segs = 0 ... take only raw Bioseqs
381 *       group_segs = 1 ... group segmented seqs into single entry.. no parts
382 *       group_segs = 2 ... show only parts of segmented seqs
383 *       group_segs = 3 ... like 1, but instantiate virtual Bioseqs
384 *
385 *****************************************************************************/
/*
 * Worker behind SeqEntrysToFasta and SeqEntryToFastaEx: builds a MyFsa that
 * prints through FastaFileFunc to fp with 70 characters per sequence line,
 * then explores sep with SeqEntryFasta.
 *
 * group_segs 3 is folded into mode 1 with do_virtual switched on.
 * Returns FALSE for a NULL sep or fp, otherwise whether anything was written.
 */
static Boolean SeqEntrysToFastaXX (SeqEntryPtr sep, FILE *fp, Boolean is_na, Uint1 group_segs, Boolean printid_general)
{
    Char     line[FASTA_BUFFER_LEN+1];
    MyFsa    out;
    FastaDat walk;

    if (sep == NULL || fp == NULL) {
        return FALSE;
    }

    /* output description */
    MemSet ((Pointer) (&out), 0, sizeof (MyFsa));
    out.buf = line;
    out.buflen = FASTA_BUFFER_LEN;
    out.seqlen = 70;
    out.mydata = (Pointer)fp;
    out.myfunc = FastaFileFunc;
    out.code = is_na ? Seq_code_iupacna : Seq_code_ncbieaa;
    out.printid_general = printid_general;
    out.bad_asn1 = FALSE;
    out.no_sequence = FALSE;
    out.formatdb = FALSE;
    out.do_virtual = FALSE;
    out.order = 0;
    out.tech = 0;
    out.accession = NULL;
    out.organism = NULL;
    out.seqloc = NULL;

    if (group_segs == 3) {  /* mode 3 = mode 1 plus virtual Bioseqs */
        out.do_virtual = TRUE;
        group_segs = 1;
    }

    /* traversal state */
    walk.mfp = &out;
    walk.is_na = is_na;
    walk.group_segs = group_segs;
    walk.last_indent = -1;
    walk.parts = -1;
    walk.seg = -1;
    walk.got_one = FALSE;

    SeqEntryExplore(sep, (Pointer)&walk, SeqEntryFasta);
    return walk.got_one;
}
SeqEntrysToFasta(SeqEntryPtr sep,FILE * fp,Boolean is_na,Uint1 group_segs)428 NLM_EXTERN Boolean SeqEntrysToFasta (SeqEntryPtr sep, FILE *fp, Boolean is_na, Uint1 group_segs)
429 {
430     return SeqEntrysToFastaXX (sep, fp, is_na, group_segs, FALSE);
431 }
432 /*****************************************************************************
433 *
434 *   SeqEntrysToFastaX(sep, mfa, is_na, group_segs)
435 *
436 *****************************************************************************/
SeqEntrysToFastaX(SeqEntryPtr sep,MyFsaPtr mfp,Boolean is_na,Uint1 group_segs)437 NLM_EXTERN Boolean SeqEntrysToFastaX (SeqEntryPtr sep, MyFsaPtr mfp, Boolean is_na, Uint1 group_segs)
438 {
439     FastaDat tfa;
440     if ((sep == NULL) || (mfp == NULL))
441         return FALSE;
442     tfa.mfp = mfp;
443     tfa.is_na = is_na;
444     if (group_segs == 3)  /* do 2 things */
445     {
446         mfp->do_virtual = TRUE;
447         group_segs = 1;
448     }
449     tfa.group_segs = group_segs;
450     tfa.last_indent = -1;
451     tfa.parts = -1;
452     tfa.seg = -1;
453     tfa.got_one = FALSE;
454         SeqEntryExplore(sep, (Pointer)&tfa, SeqEntryFasta);
455     return tfa.got_one;
456 }
457 /*****************************************************************************
458 *
*   SeqEntrysToDefline(sep, fp, is_na, group_segs)
460 *
461 *****************************************************************************/
462 #define DEFLINE_MAX_LEN FASTA_BUFFER_LEN
SeqEntrysToDefline(SeqEntryPtr sep,FILE * fp,Boolean is_na,Uint1 group_segs)463 NLM_EXTERN Boolean SeqEntrysToDefline(SeqEntryPtr sep,
464                            FILE *fp, Boolean is_na, Uint1 group_segs)
465 {
466   FastaDat tfa;
467   MyFsa mfa;
468   if ((sep == NULL) || (fp == NULL))
469     return FALSE;
470   MemSet ((Pointer) (&mfa), 0, sizeof (MyFsa));
471   mfa.buf = (CharPtr) MemNew(DEFLINE_MAX_LEN);
472   mfa.buflen = DEFLINE_MAX_LEN-1;
473   mfa.seqlen = DEFLINE_MAX_LEN;
474   mfa.mydata = (Pointer)fp;
475   mfa.myfunc = FastaFileFunc;
476   mfa.no_sequence = TRUE;
477   mfa.bad_asn1 = FALSE;
478   mfa.order = 0;
479   mfa.accession = NULL;
480   mfa.organism = NULL;
481   mfa.do_virtual = FALSE;
482   mfa.formatdb = FALSE;
483   mfa.tech = 0;
484   mfa.printid_general = FALSE;
485   mfa.seqloc = NULL;
486   tfa.mfp = &mfa;
487   tfa.is_na = is_na;
488   if (group_segs == 3)  /* do 2 things */
489     {
490       mfa.do_virtual = TRUE;
491       group_segs = 1;
492     }
493   tfa.group_segs = group_segs;
494   tfa.last_indent = -1;
495   tfa.parts = -1;
496   tfa.seg = -1;
497   tfa.got_one = FALSE;
498   SeqEntryExplore(sep, (Pointer)&tfa, SeqEntryFasta);
499   MemFree(mfa.buf);
500   return tfa.got_one;
501 }
502 /*****************************************************************************
503 *
504 *   Boolean BioseqRawToFasta(bsp, fp, is_na)
505 *
506 *****************************************************************************/
BioseqRawToFasta(BioseqPtr bsp,FILE * fp,Boolean is_na)507 NLM_EXTERN Boolean BioseqRawToFasta (BioseqPtr bsp, FILE *fp, Boolean is_na)
508 {
509     return BioseqRawToFastaExtra(bsp, fp, 80);
510 }
BioseqRawToFastaExtra(BioseqPtr bsp,FILE * fp,Int2 line_length)511 NLM_EXTERN Boolean BioseqRawToFastaExtra (BioseqPtr bsp, FILE *fp, Int2 line_length)
512 {
513      return BioseqRawToFastaExtraEx (bsp, fp, line_length, NULL);
514 }
BioseqRawToFastaExtraEx(BioseqPtr bsp,FILE * fp,Int2 line_length,SeqLocPtr slp)515 NLM_EXTERN Boolean BioseqRawToFastaExtraEx(BioseqPtr bsp, FILE *fp, Int2 line_length, SeqLocPtr slp)
516 {
517     MyFsa mfa;
518     Char buf[FASTA_BUFFER_LEN+1];
519     if ((bsp == NULL) || (fp == NULL))
520         return FALSE;
521     MemSet ((Pointer) (&mfa), 0, sizeof (MyFsa));
522     mfa.buf = buf;
523     mfa.buflen = FASTA_BUFFER_LEN;
524     mfa.seqlen = line_length;
525     mfa.mydata = (Pointer)fp;
526     mfa.myfunc = FastaFileFunc;
527     mfa.bad_asn1 = FALSE;
528     mfa.order = 0;
529     mfa.accession = NULL;
530     mfa.organism = NULL;
531     mfa.do_virtual = FALSE;
532     mfa.tech = 0;
533     mfa.no_sequence = FALSE;
534     mfa.formatdb = FALSE;
535     mfa.printid_general = FALSE;
536     mfa.seqloc = slp;
537      return BioseqRawToFastaX(bsp, &mfa, ISA_na(bsp->mol));
538 }
539 /*****************************************************************************
540 *
541 *   Boolean BioseqRawToFastaX(bsp, mfp, is_na)
542 *
543 *****************************************************************************/
BioseqRawToFastaX(BioseqPtr bsp,MyFsaPtr mfp,Boolean is_na)544 NLM_EXTERN Boolean BioseqRawToFastaX (BioseqPtr bsp, MyFsaPtr mfp, Boolean is_na)
545 {
546     Uint1 repr;
547     if ((bsp == NULL) || (mfp == NULL))
548         return FALSE;
549     repr = Bioseq_repr(bsp);
550     if (! ((repr == Seq_repr_raw) || (repr == Seq_repr_const)))
551         return FALSE;
552     return BioseqToFastaX(bsp, mfp, is_na);
553 }
554 /*****************************************************************************
555 *
556 *   Boolean BioseqToFasta(bsp, fp, is_na)
557 *
558 *****************************************************************************/
BioseqToFasta(BioseqPtr bsp,FILE * fp,Boolean is_na)559 NLM_EXTERN Boolean BioseqToFasta (BioseqPtr bsp, FILE *fp, Boolean is_na)
560 {
561     MyFsa mfa;
562     Char buf[FASTA_BUFFER_LEN+1];
563     if ((bsp == NULL) || (fp == NULL))
564         return FALSE;
565     MemSet ((Pointer) (&mfa), 0, sizeof (MyFsa));
566     mfa.buf = buf;
567     mfa.buflen = FASTA_BUFFER_LEN;
568     mfa.seqlen = 80;
569     mfa.mydata = (Pointer)fp;
570     mfa.myfunc = FastaFileFunc;
571     mfa.bad_asn1 = FALSE;
572     mfa.order = 0;
573     mfa.accession = NULL;
574     mfa.organism = NULL;
575     mfa.do_virtual = FALSE;
576     mfa.tech = 0;
577     mfa.no_sequence = FALSE;
578     mfa.formatdb = FALSE;
579     mfa.printid_general = FALSE;
580     mfa.seqloc = NULL;
581     return BioseqToFastaX(bsp, &mfa, is_na);
582 }
583 /*****************************************************************************
584 *
585 *   Boolean BioseqToFastaDump(bsp, fp, is_na)
586 *
587 *****************************************************************************/
BioseqToFastaDump(BioseqPtr bsp,FILE * fp,Boolean is_na)588 NLM_EXTERN Boolean BioseqToFastaDump (BioseqPtr bsp, FILE *fp, Boolean is_na)
589 {
590     MyFsa mfa;
591     Char buf[FASTA_BUFFER_LEN+1];
592     if ((bsp == NULL) || (fp == NULL))
593         return FALSE;
594     MemSet ((Pointer) (&mfa), 0, sizeof (MyFsa));
595     mfa.buf = buf;
596     mfa.buflen = FASTA_BUFFER_LEN;
597     mfa.seqlen = 80;
598     mfa.mydata = (Pointer)fp;
599     mfa.myfunc = FastaDumpFileFunc;
600     mfa.bad_asn1 = FALSE;
601     mfa.order = 0;
602     mfa.accession = NULL;
603     mfa.organism = NULL;
604     mfa.do_virtual = FALSE;
605     mfa.tech = 0;
606     mfa.no_sequence = FALSE;
607     mfa.formatdb = FALSE;
608     mfa.printid_general = FALSE;
609     mfa.seqloc = NULL;
610     return BioseqToFastaX(bsp, &mfa, is_na);
611 }
612 /*****************************************************************************
613 *
614 *   Boolean BioseqToFastaX(bsp, mfp, is_na)
615 *
616 *****************************************************************************/
617 static Boolean FastaIdX(BioseqPtr bsp, CharPtr buf, Uint4 buflen, Boolean printid_general, SeqLocPtr seqloc);
BioseqToFastaX(BioseqPtr bsp,MyFsaPtr mfp,Boolean is_na)618 NLM_EXTERN Boolean BioseqToFastaX (BioseqPtr bsp, MyFsaPtr mfp, Boolean is_na)
619 {
620     SeqPortPtr spp;
621     Uint1 repr, code;
622     Char buf[41];
623     SeqIdPtr sip;
624     Uint1 order = 255;
625     Boolean is_patent = FALSE, is_genbank = FALSE;
626     Uint1Ptr order_array;
627     int i;
628     CharPtr organism = NULL;
629     if ((bsp == NULL) || (mfp == NULL))
630         return FALSE;
631     repr = Bioseq_repr(bsp);
632     if (ISA_na(bsp->mol))
633     {
634         if (! is_na)
635             return FALSE;
636         order_array = na_order;
637     }
638     else if (ISA_aa(bsp->mol))
639     {
640         if (is_na)
641             return FALSE;
642         order_array = aa_order;
643         if (mfp->accession != NULL)           /* translated genbank */
644         {
645             order = order_array[SEQID_GENBANK];
646             is_genbank = TRUE;
647             organism = mfp->organism;
648         }
649     }
650     else
651     {
652         buf[0] = '\0';
653           SeqIdWrite(SeqIdFindBest(bsp->id, 0), buf, PRINTID_FASTA_LONG, 40);
654           ErrPostEx(SEV_ERROR,0,0,"ToFasta: [%s] Unrecognized bsp->mol = %d",
655           buf, (int)(bsp->mol));
656     mfp->bad_asn1 = TRUE;
657     return FALSE;
658     }
659     mfp->bsp = bsp;
660     for (sip = bsp->id; sip != NULL; sip = sip->next)
661     {
662         i=(int)(sip->choice);
663         if (! is_genbank)    /* don't change order for translated genbank */
664         {
665             if (order_array[i] < order)
666                 order = order_array[i];
667         }
668         if (i == (int)SEQID_PATENT)
669             is_patent = TRUE;
670         else if (i == (int)SEQID_PRF)
671             organism = mfp->organism;
672     }
673     if (is_patent)
674         order = PATENT_ORDER;
675     mfp->order = order;
676     switch (mfp->tech)
677     {
678         case MI_TECH_est:
679         case MI_TECH_sts:
680         case MI_TECH_survey:
681         case MI_TECH_htgs_1:
682         case MI_TECH_htgs_2:
683         case MI_TECH_htgs_3:
684             organism = mfp->organism;
685             break;
686         default:
687             break;
688     }
689     if (! FastaIdX(bsp, mfp->buf, mfp->buflen, mfp->printid_general, mfp->seqloc))
690         return FALSE;
691     (*(mfp->myfunc))(bsp, FASTA_ID, mfp->buf, StringLen(mfp->buf), mfp->mydata);
692        if (! CreateDefLine(NULL, bsp, mfp->buf, mfp->buflen, mfp->tech, mfp->accession, organism))
693            return FALSE;
694     (*(mfp->myfunc))(bsp, FASTA_DEFLINE, mfp->buf, StringLen(mfp->buf), mfp->mydata);
695         if (mfp->formatdb && is_na) {
696             (*(mfp->myfunc))(bsp, FASTA_FORMATDB_AMB, mfp->buf, StringLen(mfp->buf), mfp->mydata);
697         }
698         else if(!mfp->no_sequence) {
699         if (!mfp->formatdb) {
700         if (is_na)
701             code = Seq_code_iupacna;
702         else
703             code = Seq_code_ncbieaa;
704         } else {
705         code = mfp->code;
706         }
707         if (repr == Seq_repr_virtual && (! mfp->do_virtual)) {
708             StringCpy (mfp->buf, "-");
709                 (*(mfp->myfunc))(bsp, FASTA_SEQLINE, mfp->buf, StringLen(mfp->buf),
710                                  mfp->mydata);
711             (*(mfp->myfunc))(bsp, FASTA_EOS, mfp->buf, StringLen(mfp->buf),
712                              mfp->mydata);
713             return TRUE;
714         }
715             spp = FastaSeqPortEx (bsp, is_na, mfp->do_virtual, code, mfp->seqloc);
716             if (spp == NULL) return FALSE;
717             while (FastaSeqLineEx(spp, mfp->buf, mfp->seqlen, is_na, mfp->do_virtual))
718                 (*(mfp->myfunc))(bsp, FASTA_SEQLINE, mfp->buf, StringLen(mfp->buf),
719                                  mfp->mydata);
720             SeqPortFree(spp);
721             (*(mfp->myfunc))(bsp, FASTA_EOS, mfp->buf, StringLen(mfp->buf),
722                              mfp->mydata);
723         }
724         return TRUE;
725 }
726 
727 /*****************************************************************************
728 *
729 *   BioseqFastaStream (bsp, fp, flags, linelen, blocklen, grouplen, do_defline)
730 *
731 *       Rapid FASTA generator using SeqPortStream
732 *
733 *****************************************************************************/
734 
/* State shared between successive FsaStreamProc calls while one sequence is streamed. */
typedef struct streamfsa {
  FILE          *fp;        /* output stream; used when non-NULL */
  ByteStorePtr  bs;         /* output byte store; used only when fp is NULL */
  Char          buf [512];  /* output line being assembled */
  Int2          idx;        /* next write position in buf */
  Int2          lin;        /* sequence characters on the current line (block spaces not counted) */
  Int2          blk;        /* sequence characters since the last block separator */
  Int2          grp;        /* lines written since the last group separator */
  Int2          linelen;    /* sequence characters per output line */
  Int2          blocklen;   /* characters per block; values <= 0 disable the space separator */
  Int2          grouplen;   /* lines per group; values <= 0 disable the blank-line separator */
  Int2          skip;       /* leading sequence characters still to be discarded */
  BIG_ID        gi;         /* printed in the id attribute of each span */
  Int4          start;      /* printed (plus 1) in the span id; reset to seqpos + 1 after each line */
  Int4          seqpos;     /* advanced once per sequence character consumed, skipped or stored */
  Boolean       seqspans;   /* when TRUE each line is wrapped in a span element */
} StreamFsa, PNTR StreamFsaPtr;
752 
FsaStreamProc(CharPtr sequence,Pointer userdata)753 static void LIBCALLBACK FsaStreamProc (
754   CharPtr sequence,
755   Pointer userdata
756 )
757 
758 {
759   Char          ch;
760   StreamFsaPtr  sfp;
761   Char          spn [64];
762 
763   if (StringHasNoText (sequence) || userdata == NULL) return;
764   sfp = (StreamFsaPtr) userdata;
765   ch = *sequence;
766   while (ch != '\0' && sfp->skip > 0) {
767     (sfp->skip)--;
768     (sfp->seqpos)++;
769     sequence++;
770     ch = *sequence;
771   }
772   while (ch != '\0') {
773     /* optionally separate blocks with space */
774     if (sfp->blk >= sfp->blocklen && sfp->blocklen > 0) {
775       sfp->buf [sfp->idx] = ' ';
776       (sfp->idx)++;
777       sfp->blk = 0;
778     }
779     /* save sequence character to buffer */
780     sfp->buf [sfp->idx] = ch;
781     (sfp->idx)++;
782     (sfp->lin)++;
783     (sfp->blk)++;
784     /* write sequence as soon as we have line of characters */
785     if (sfp->lin >= sfp->linelen) {
786       sfp->buf [sfp->idx] = '\0';
787       /* optionally separate groups with blank line */
788       if (sfp->grp >= sfp->grouplen && sfp->grouplen > 0) {
789         if (sfp->fp != NULL) {
790           fprintf (sfp->fp, "\n");
791         } else if (sfp->bs != NULL) {
792           BSWrite (sfp->bs, "\n", sizeof ("\n"));
793         }
794         sfp->grp = 0;
795       }
796       /* print actual sequence line here */
797       if (sfp->fp != NULL) {
798         if (sfp->seqspans) {
799           fprintf (sfp->fp, "<span class=\"ff_line\" id=\"gi_%ld_%ld\">", (long) sfp->gi, (long) (sfp->start + 1));
800         }
801         fprintf (sfp->fp, "%s", sfp->buf);
802         if (sfp->seqspans) {
803           fprintf (sfp->fp, "</span>");
804         }
805         fprintf (sfp->fp, "\n");
806       } else if (sfp->bs != NULL) {
807         if (sfp->seqspans) {
808           sprintf (spn, "<span class=\"ff_line\" id=\"gi_%ld_%ld\">", (long) sfp->gi, (long) (sfp->start + 1));
809           BSWrite (sfp->bs, spn, StringLen (spn));
810         }
811         BSWrite (sfp->bs, sfp->buf, StringLen (sfp->buf));
812         if (sfp->seqspans) {
813           BSWrite (sfp->bs, "</span>", sizeof ("</span>"));
814         }
815         BSWrite (sfp->bs, "\n", sizeof ("\n"));
816       }
817       sfp->start = sfp->seqpos + 1;
818       sfp->idx = 0;
819       sfp->lin = 0;
820       sfp->blk = 0;
821       (sfp->grp)++;
822     }
823     (sfp->seqpos)++;
824     sequence++;
825     ch = *sequence;
826   }
827 }
828 
829 /* If Bioseq is a protein, does not have an accession,
830  * and is the only protein in the nuc-prot set or is part of
831  * a sorted protein file, use the nuc bioseq ID in the FASTA
832  * defline.
833  */
834 
ChooseFastaID(BioseqPtr bsp,Boolean allow_mult)835 static SeqIdPtr ChooseFastaID (BioseqPtr bsp, Boolean allow_mult)
836 
837 {
838   BioseqSetPtr bssp;
839   BioseqPtr    nuc_bsp = NULL;
840   SeqIdPtr     sip;
841   if (bsp == NULL) return NULL;
842   if (!ISA_aa(bsp->mol) || bsp->idx.parenttype != OBJ_BIOSEQSET || bsp->idx.parentptr == NULL) {
843     return bsp->id;
844   }
845   /* if protein sequence has an accession, do not use nucleotide ID */
846   sip = bsp->id;
847   while (sip != NULL) {
848     if (sip->choice == SEQID_GENBANK) {
849       return sip;
850     } else {
851       sip = sip->next;
852     }
853   }
854   bssp = (BioseqSetPtr) bsp->idx.parentptr;
855   if (bssp->_class != BioseqseqSet_class_nuc_prot /* not in nuc-prot set */
856       || bssp->seq_set == NULL /* no sequences in set - bad indexing */
857       || bssp->seq_set->next == NULL /* only one sequence in nuc-prot set, degenerate */
858       || (!allow_mult && bssp->seq_set->next->next != NULL) /* more than one protein in nuc-prot set */) {
859     return bsp->id;
860   }
861   if (IS_Bioseq (bssp->seq_set)) {
862     nuc_bsp = bssp->seq_set->data.ptrvalue;
863   } else if (IS_Bioseq_set (bssp->seq_set)) {
864     bssp = bssp->seq_set->data.ptrvalue;
865     if (bssp->_class == BioseqseqSet_class_segset
866         && bssp->seq_set != NULL
867         && IS_Bioseq (bssp->seq_set)) {
868       nuc_bsp = bssp->seq_set->data.ptrvalue;
869     }
870   }
871   if (nuc_bsp == NULL) {
872     return bsp->id;
873   } else {
874     return nuc_bsp->id;
875   }
876 }
877 
/*
 * Append one "[qualifier=value] " group to str for every SubSource on biop.
 *
 *   biop - source whose subtype list is walked; NULL is a no-op
 *   str  - NUL-terminated destination that is appended to; NULL is a no-op
 *
 * The qualifier name comes from GetSubsourceQualName ("subsource" when that
 * name has no text) and is lower-cased.  The value is wrapped in double
 * quotes when it contains '=', '[' or ']'.
 *
 * Each group is written straight onto the end of str, so the length of a
 * value is not limited by any fixed-size scratch buffer.  The capacity of
 * str itself is not checked here; the caller must provide enough room.
 */
static void AddSubSourceValuesToNucTitle (
  BioSourcePtr biop,
  CharPtr str
)

{
  Boolean       needsQuotes;
  CharPtr       ssp_name;
  SubSourcePtr  ssp;
  CharPtr       tail;

  if (biop == NULL || str == NULL) return;

  /* find the current end of str; every group is built from there */
  tail = str;
  while (*tail != '\0') {
    tail++;
  }

  ssp = biop->subtype;
  while (ssp != NULL) {
    StringCpy (tail, "[");
    ssp_name = GetSubsourceQualName (ssp->subtype);
    if (StringHasNoText (ssp_name)) {
      StringCat (tail, "subsource");
    } else {
      StringCat (tail, ssp_name);
    }
    /* tail holds only "[qualifier" at this point, so earlier text and the
       value appended below keep their case */
    StringToLower (tail);
    needsQuotes = FALSE;
    if (StringChr (ssp->name, '=') != NULL ||
        StringChr (ssp->name, '[') != NULL ||
        StringChr (ssp->name, ']') != NULL) {
      needsQuotes = TRUE;
    }
    StringCat (tail, "=");
    if (needsQuotes) {
      StringCat (tail, "\"");
    }
    StringCat (tail, ssp->name);
    if (needsQuotes) {
      StringCat (tail, "\"");
    }
    StringCat (tail, "] ");
    /* step past the group just written */
    while (*tail != '\0') {
      tail++;
    }
    ssp = ssp->next;
  }
}
919 
/*
 * AddOrgModValuesToNucTitle
 *
 * Appends one "[qualifier=value] " tag to str for every OrgMod in
 * biop->org->orgname->mod.  The qualifier text comes from GetOrgModQualName
 * and is lowercased.  A value that contains '=', '[' or ']' is wrapped in
 * double quotes.
 *
 * Each tag is built directly at the end of str rather than being staged in
 * a fixed 256-byte scratch array, which a long OrgMod value could overrun.
 * The capacity of str is not passed in, so the caller must supply a
 * NUL-terminated buffer with room for everything appended here.
 *
 * Does nothing when biop, its org, its orgname, or str is NULL.
 */
static void AddOrgModValuesToNucTitle (
  BioSourcePtr biop,
  CharPtr str
)

{
  CharPtr    mod_name;
  OrgModPtr  mod;
  Boolean    needsQuotes;
  CharPtr    tag;

  if (biop == NULL || biop->org == NULL || biop->org->orgname == NULL || str == NULL) return;

  for (mod = biop->org->orgname->mod; mod != NULL; mod = mod->next) {
    /* start of this tag, so that only "[qualifier" gets lowercased below */
    tag = str + StringLen (str);

    StringCat (tag, "[");
    mod_name = GetOrgModQualName (mod->subtype);
    StringCat (tag, mod_name);
    /* the value is appended afterwards and keeps its original case */
    StringToLower (tag);

    needsQuotes = FALSE;
    if (StringChr (mod->subname, '=') != NULL ||
        StringChr (mod->subname, '[') != NULL ||
        StringChr (mod->subname, ']') != NULL) {
      needsQuotes = TRUE;
    }

    StringCat (tag, "=");
    if (needsQuotes) {
      StringCat (tag, "\"");
    }
    StringCat (tag, mod->subname);
    if (needsQuotes) {
      StringCat (tag, "\"");
    }
    StringCat (tag, "] ");
  }
}
957 
MakeNucleotideTitleInSequinStyle(BioseqPtr bsp)958 static CharPtr MakeNucleotideTitleInSequinStyle (
959   BioseqPtr bsp
960 )
961 
962 {
963   BioSourcePtr  biop;
964   MolInfoPtr    mip;
965   Boolean       needsQuotes;
966   OrgNamePtr    onp;
967   OrgRefPtr     orp;
968   SeqDescrPtr   sdp;
969   CharPtr       str;
970   Uint1         tech = 0;
971   Char          text [256];
972   CharPtr       tmp;
973 
974   if (bsp == NULL) return NULL;
975   if (! ISA_na (bsp->mol)) return NULL;
976   sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_source, NULL);
977   if (sdp == NULL) return NULL;
978   biop = (BioSourcePtr) sdp->data.ptrvalue;
979   if (biop == NULL) return NULL;
980   sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_molinfo, NULL);
981   if (sdp != NULL) {
982     mip = (MolInfoPtr) sdp->data.ptrvalue;
983     if (mip != NULL) {
984       switch (mip->tech) {
985         case MI_TECH_est :
986         case MI_TECH_sts :
987         case MI_TECH_survey :
988         case MI_TECH_htgs_1 :
989         case MI_TECH_htgs_2 :
990         case MI_TECH_htgs_3 :
991         case MI_TECH_fli_cdna :
992         case MI_TECH_htgs_0 :
993         case MI_TECH_htc :
994         case MI_TECH_wgs :
995           tech = mip->tech;
996           break;
997         default :
998           break;
999       }
1000     }
1001   }
1002   str = MemNew (5000);
1003 
1004   orp = biop->org;
1005   if (orp != NULL) {
1006     needsQuotes = FALSE;
1007     if (StringChr (orp->taxname, '=') != NULL ||
1008         StringChr (orp->taxname, '[') != NULL ||
1009         StringChr (orp->taxname, ']') != NULL) {
1010       needsQuotes = TRUE;
1011     }
1012     StringCpy (text, "[organism=");
1013     if (needsQuotes) {
1014       StringCat (text, "\"");
1015     }
1016     StringCat (text, orp->taxname);
1017     if (needsQuotes) {
1018       StringCat (text, "\"");
1019     }
1020     StringCat (text, "] ");
1021     StringCat (str, text);
1022   }
1023 
1024   AddSubSourceValuesToNucTitle (biop, str);
1025 
1026   AddOrgModValuesToNucTitle (biop, str);
1027 
1028   if (tech > 0) {
1029     StringCpy (text, "[tech=");
1030     StringCat (text, TechNameFromTech (tech));
1031     StringCat (text, "] ");
1032     StringCat (str, text);
1033   }
1034 
1035   if (bsp->topology == TOPOLOGY_CIRCULAR) {
1036     StringCat (str, "[topology=circular] ");
1037   }
1038 
1039   if (orp != NULL) {
1040     onp = orp->orgname;
1041     if (onp != NULL) {
1042       if (onp->gcode > 0) {
1043         sprintf (text, "[gcode=%d] ", (int) onp->gcode);
1044         StringCat (str, text);
1045       }
1046       if (onp->mgcode > 0) {
1047         sprintf (text, "[mgcode=%d] ", (int) onp->mgcode);
1048         StringCat (str, text);
1049       }
1050       if (onp->pgcode > 0) {
1051         sprintf (text, "[pgcode=%d] ", (int) onp->pgcode);
1052         StringCat (str, text);
1053       }
1054     }
1055   }
1056 
1057   TrimSpacesAroundString (str);
1058   if (StringHasNoText (str)) {
1059     MemFree (str);
1060     return NULL;
1061   }
1062 
1063   tmp = StringSave (str);
1064   MemFree (str);
1065 
1066   return tmp;
1067 }
1068 
BioseqFastaStreamInternal(BioseqPtr bsp,SeqLocPtr slp,SeqLitPtr lit,CharPtr str,FILE * fp,ByteStorePtr bs,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen,Boolean do_defline,Boolean substitute_ids,Boolean sorted_prot,Int2 skip)1069 static Int4 BioseqFastaStreamInternal (
1070   BioseqPtr bsp,
1071   SeqLocPtr slp,
1072   SeqLitPtr lit,
1073   CharPtr str,
1074   FILE *fp,
1075   ByteStorePtr bs,
1076   StreamFlgType flags,
1077   Int2 linelen,
1078   Int2 blocklen,
1079   Int2 grouplen,
1080   Boolean do_defline,
1081   Boolean substitute_ids,
1082   Boolean sorted_prot,
1083   Int2 skip
1084 )
1085 
1086 {
1087   Char         acc [41];
1088   SeqIdPtr     accn = NULL;
1089   Char         buf [4096];
1090   Char         ch, ch1, ch2, ch3;
1091   Int4         count = 0;
1092   BIG_ID       gi = -1;
1093   SeqIdPtr     gpp = NULL;
1094   Char         id [128];
1095   Uint1        id_format = PRINTID_FASTA_LONG;
1096   CharPtr      original_id = NULL;
1097   CharPtr      ptr;
1098   StreamFsa    sf;
1099   SeqIdPtr     sip = NULL;
1100   Char         spn [64];
1101   CharPtr      tmp;
1102 
1103   if (bsp == NULL && slp == NULL && lit == NULL && str == NULL) return 0;
1104   if (fp == NULL && bs == NULL) return 0;
1105   if (bsp != NULL && bsp->repr == Seq_repr_virtual) return 0;
1106   if (linelen > 128) {
1107     linelen = 128;
1108   }
1109   if (linelen < 1) {
1110     linelen = 60;
1111   }
1112   if (blocklen > 100) {
1113     blocklen = 100;
1114   }
1115   if (blocklen < 1) {
1116     blocklen = 0;
1117   }
1118   if (grouplen > 100) {
1119     grouplen = 100;
1120   }
1121   if (grouplen < 1) {
1122     grouplen = 0;
1123   }
1124   acc [0] = '\0';
1125   MemSet ((Pointer) &sf, 0, sizeof (StreamFsa));
1126   sf.fp = fp;
1127   sf.bs = bs;
1128   sf.idx = 0;
1129   sf.lin = 0;
1130   sf.blk = 0;
1131   sf.grp = 0;
1132   sf.linelen = linelen;
1133   sf.blocklen = blocklen;
1134   sf.grouplen = grouplen;
1135   sf.skip = skip;
1136   sf.gi = 0;
1137   sf.start = 0;
1138   sf.seqpos = 0;
1139   sf.seqspans = (Boolean) ((flags & STREAM_HTML_SPANS) != 0);
1140   if (sf.seqspans) {
1141     if (bsp != NULL) {
1142       for (sip = bsp->id; sip != NULL; sip = sip->next) {
1143         switch (sip->choice) {
1144           case SEQID_GI :
1145             gi = sip->data.intvalue;
1146             break;
1147           case SEQID_GENBANK :
1148           case SEQID_EMBL :
1149           case SEQID_DDBJ :
1150           case SEQID_OTHER :
1151             accn = sip;
1152             break;
1153           case SEQID_PIR :
1154           case SEQID_SWISSPROT :
1155           case SEQID_PRF :
1156           case SEQID_PDB :
1157             accn = sip;
1158             break;
1159           case SEQID_TPG :
1160           case SEQID_TPE :
1161           case SEQID_TPD :
1162             accn = sip;
1163             break;
1164           case SEQID_GPIPE :
1165             /* should not override better accession */
1166             gpp = sip;
1167             break;
1168           default :
1169             break;
1170         }
1171       }
1172     } else if (slp != NULL) {
1173       /* PUBSEQ_OS will send a SeqInt with a chain of Seq-ids */
1174       for (sip = SeqLocId (slp); sip != NULL; sip = sip->next) {
1175         switch (sip->choice) {
1176           case SEQID_GI :
1177             gi = sip->data.intvalue;
1178             break;
1179           case SEQID_GENBANK :
1180           case SEQID_EMBL :
1181           case SEQID_DDBJ :
1182           case SEQID_OTHER :
1183             accn = sip;
1184             break;
1185           case SEQID_PIR :
1186           case SEQID_SWISSPROT :
1187           case SEQID_PRF :
1188           case SEQID_PDB :
1189             accn = sip;
1190             break;
1191           case SEQID_TPG :
1192           case SEQID_TPE :
1193           case SEQID_TPD :
1194             accn = sip;
1195             break;
1196           case SEQID_GPIPE :
1197             /* should not override better accession */
1198             gpp = sip;
1199             break;
1200           default :
1201             break;
1202         }
1203       }
1204       if (sip != NULL && sip->choice == SEQID_GI) {
1205         sf.gi = sip->data.intvalue;
1206       }
1207     }
1208     if (gi > 0) {
1209       sf.gi = gi;
1210     }
1211     if (accn == NULL) {
1212       accn = gpp;
1213     }
1214     if (accn != NULL) {
1215       SeqIdWrite (accn, acc, PRINTID_TEXTID_ACC_ONLY, sizeof (acc) - 1);
1216 
1217       if (accn->choice == SEQID_PDB) {
1218         ptr = StringChr (acc, '_');
1219         if (ptr != NULL) {
1220           ch1 = ptr [1];
1221           if (ch1 != '\0') {
1222             ch2 = ptr [2];
1223             if (ch2 != '\0') {
1224               ch3 = ptr [3];
1225               if (ch3 == '\0') {
1226                 if (ch1 == ch2) {
1227                   if (IS_UPPER (ch1)) {
1228                     ptr [1] = TO_LOWER (ch1);
1229                     ptr [2] = '\0';
1230                   }
1231                 }
1232               }
1233             }
1234           }
1235         }
1236       }
1237     }
1238   }
1239   if (do_defline) {
1240     id [0] = '\0';
1241     if (ShouldUseOriginalID (bsp)) {
1242       original_id = FastaGetOriginalId (bsp);
1243     }
1244     if (substitute_ids) {
1245       sip = ChooseFastaID (bsp, sorted_prot);
1246     } else if (bsp != NULL) {
1247       sip = bsp->id;
1248     }
1249     if ((flags & STREAM_ALL_FASTA_IDS) != 0) {
1250       id_format = PRINTID_FASTA_ALL;
1251     }
1252     if (original_id != NULL && StringLen (original_id) + 5 < sizeof (id)) {
1253       sprintf (id, "lcl|%s", original_id);
1254     } else {
1255       SeqIdWrite (sip, id, id_format, sizeof (id) - 1);
1256     }
1257     /* no longer need to do feature indexing if title not present to speed up creation */
1258     /*
1259     sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_title, NULL);
1260     if (sdp == NULL) {
1261       entityID = ObjMgrGetEntityIDForPointer (bsp);
1262       if (! SeqMgrFeaturesAreIndexed (entityID)) {
1263         SeqMgrIndexFeatures (entityID, NULL);
1264       }
1265     }
1266     */
1267     buf [0] = '\0';
1268     if ((flags & STREAM_TAGGED_DEFLINE) != 0) {
1269       str = MakeNucleotideTitleInSequinStyle (bsp);
1270       StringNCpy_0 (buf, str, sizeof (buf));
1271       MemFree (str);
1272     } else {
1273       NewCreateDefLineBuf (NULL, bsp, buf, sizeof (buf), FALSE, FALSE);
1274     }
1275     tmp = buf;
1276     ch = *tmp;
1277     if (ch == '>') {
1278       *tmp = '_';
1279     }
1280     /*
1281     tmp = buf;
1282     ch = *tmp;
1283     while (ch != '\0') {
1284       if (ch == '>') {
1285         *tmp = '_';
1286       }
1287       tmp++;
1288       ch = *tmp;
1289     }
1290     */
1291     if (sf.fp != NULL) {
1292       fprintf (fp, ">%s %s\n", id, buf);
1293     } else if (sf.bs != NULL) {
1294       BSWrite (sf.bs, ">", sizeof (">"));
1295       BSWrite (sf.bs, id, StringLen (id));
1296       BSWrite (sf.bs, " ", sizeof (" "));
1297       BSWrite (sf.bs, buf, StringLen (buf));
1298       BSWrite (sf.bs, "\n", sizeof ("\n"));
1299     }
1300   }
1301   if (bsp != NULL) {
1302     count = SeqPortStream (bsp, flags, (Pointer) &sf, FsaStreamProc);
1303   } else if (slp != NULL) {
1304     count = SeqPortStreamLoc (slp, flags, (Pointer) &sf, FsaStreamProc);
1305   } else if (lit != NULL) {
1306     count = SeqPortStreamLit (lit, flags, (Pointer) &sf, FsaStreamProc);
1307   } else if (str != NULL) {
1308     count = StringLen (str);
1309     FsaStreamProc (str, (Pointer) &sf);
1310   }
1311   /* print any remaining sequence */
1312   if (sf.lin > 0) {
1313     sf.buf [sf.idx] = '\0';
1314     if (sf.grp >= sf.grouplen && sf.grouplen > 0) {
1315       if (sf.fp != NULL) {
1316         fprintf (fp, "\n");
1317       } else if (sf.bs != NULL) {
1318         BSWrite (sf.bs, "\n", sizeof ("\n"));
1319       }
1320     }
1321     if (sf.fp != NULL) {
1322       if (sf.seqspans) {
1323         fprintf (sf.fp, "<span class=\"ff_line\" id=\"gi_%ld_%ld\">", (long) sf.gi, (long) (sf.start + 1));
1324       }
1325       fprintf (sf.fp, "%s", sf.buf);
1326       if (sf.seqspans) {
1327         fprintf (sf.fp, "</span>");
1328       }
1329       fprintf (sf.fp, "\n");
1330       if (sf.seqspans) {
1331         fprintf (sf.fp, "<script type=\"text/javascript\">");
1332         fprintf (sf.fp, "if (typeof(oData) == \"undefined\") oData = []; ");
1333         fprintf (sf.fp, "oData.push({gi:%ld,acc:\"%s\"})", (long) sf.gi, acc);
1334         fprintf (sf.fp, "</script>\n");
1335       }
1336     } else if (sf.bs != NULL) {
1337       if (sf.seqspans) {
1338         sprintf (spn, "<span class=\"ff_line\" id=\"gi_%ld_%ld\">", (long) sf.gi, (long) (sf.start + 1));
1339         BSWrite (sf.bs, spn, StringLen (spn));
1340       }
1341       BSWrite (sf.bs, sf.buf, StringLen (sf.buf));
1342       if (sf.seqspans) {
1343         BSWrite (sf.bs, "</span>", sizeof ("</span>"));
1344       }
1345       BSWrite (sf.bs, "\n", sizeof ("\n"));
1346       if (sf.seqspans) {
1347         sprintf (spn, "<script type=\"text/javascript\">");
1348         BSWrite (sf.bs, spn, StringLen (spn));
1349         sprintf (spn, "if (typeof(oData) == \"undefined\") oData = []; ");
1350         BSWrite (sf.bs, spn, StringLen (spn));
1351         sprintf (spn, "oData.push({gi:%ld,acc:\"%s\"})", (long) sf.gi, acc);
1352         BSWrite (sf.bs, spn, StringLen (spn));
1353         sprintf (spn, "</script>\n");
1354         BSWrite (sf.bs, spn, StringLen (spn));
1355       }
1356     }
1357   }
1358   return count;
1359 }
1360 
BioseqFastaStream(BioseqPtr bsp,FILE * fp,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen,Boolean do_defline)1361 NLM_EXTERN Int4 BioseqFastaStream (
1362   BioseqPtr bsp,
1363   FILE *fp,
1364   StreamFlgType flags,
1365   Int2 linelen,
1366   Int2 blocklen,
1367   Int2 grouplen,
1368   Boolean do_defline
1369 )
1370 
1371 {
1372   return BioseqFastaStreamInternal (bsp, NULL, NULL, NULL, fp, NULL, flags,
1373                                     linelen, blocklen, grouplen,
1374                                     do_defline, FALSE, FALSE, 0);
1375 }
1376 
BioseqFastaStreamEx(BioseqPtr bsp,FILE * fp,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen,Boolean do_defline,Boolean substitute_ids,Boolean sorted_protein)1377 NLM_EXTERN Int4 BioseqFastaStreamEx (
1378   BioseqPtr bsp,
1379   FILE *fp,
1380   StreamFlgType flags,
1381   Int2 linelen,
1382   Int2 blocklen,
1383   Int2 grouplen,
1384   Boolean do_defline,
1385   Boolean substitute_ids,
1386   Boolean sorted_protein
1387 )
1388 
1389 {
1390   return BioseqFastaStreamInternal (bsp, NULL, NULL, NULL, fp, NULL, flags,
1391                                     linelen, blocklen, grouplen,
1392                                     do_defline, substitute_ids, sorted_protein, 0);
1393 }
1394 
BioseqFastaMemStream(BioseqPtr bsp,ByteStorePtr bs,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen,Boolean do_defline)1395 NLM_EXTERN Int4 BioseqFastaMemStream (
1396   BioseqPtr bsp,
1397   ByteStorePtr bs,
1398   StreamFlgType flags,
1399   Int2 linelen,
1400   Int2 blocklen,
1401   Int2 grouplen,
1402   Boolean do_defline
1403 )
1404 
1405 {
1406   return BioseqFastaStreamInternal (bsp, NULL, NULL, NULL, NULL, bs, flags,
1407                                     linelen, blocklen, grouplen,
1408                                     do_defline, FALSE, FALSE, 0);
1409 }
1410 
SeqLocFastaStream(SeqLocPtr slp,FILE * fp,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen)1411 NLM_EXTERN Int4 SeqLocFastaStream (
1412   SeqLocPtr slp,
1413   FILE *fp,
1414   StreamFlgType flags,
1415   Int2 linelen,
1416   Int2 blocklen,
1417   Int2 grouplen
1418 )
1419 
1420 {
1421   if (slp == NULL || fp == NULL) return 0;
1422 
1423   return BioseqFastaStreamInternal (NULL, slp, NULL, NULL, fp, NULL, flags,
1424                                     linelen, blocklen, grouplen,
1425                                     FALSE, FALSE, FALSE, 0);
1426 }
1427 
SeqLitFastaStream(SeqLitPtr lit,FILE * fp,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen)1428 NLM_EXTERN Int4 SeqLitFastaStream (
1429   SeqLitPtr lit,
1430   FILE *fp,
1431   StreamFlgType flags,
1432   Int2 linelen,
1433   Int2 blocklen,
1434   Int2 grouplen
1435 )
1436 
1437 {
1438   if (lit == NULL || fp == NULL) return 0;
1439 
1440   return BioseqFastaStreamInternal (NULL, NULL, lit, NULL, fp, NULL, flags,
1441                                     linelen, blocklen, grouplen,
1442                                     FALSE, FALSE, FALSE, 0);
1443 }
1444 
/*
 * DoSpecialDefline
 *
 * Writes the FASTA id and defline for a coding-region feature to fp through
 * FastaFileFunc.
 *
 *   sfp        the CDS feature
 *   crp        its CdRegion (used for the [frame=N] tag)
 *   idSuffix   optional text appended to the "lcl|<id>" identifier; ignored
 *              when it has no text or is 200 characters or longer
 *   mappedloc  location printed in [location=...]; sfp->location if NULL
 *   parentbsp  Bioseq whose id is used for the identifier; the Bioseq that
 *              carries the feature if NULL
 *
 * Three outcomes:
 *   - no Bioseq can be found or locked for the location: defline "?"
 *   - the indexed feature lookup does not return sfp itself: defline "??"
 *   - otherwise a tagged defline with gene, protein, frame, partial,
 *     protein_id and location tags
 *
 * A Bioseq obtained with BioseqLockById is unlocked on every path; the
 * earlier code only unlocked it after writing the full tagged defline and
 * leaked the lock on the "??" path.
 */
static void DoSpecialDefline (
  SeqFeatPtr sfp,
  FILE *fp,
  CdRegionPtr crp,
  CharPtr idSuffix,
  SeqLocPtr mappedloc,
  BioseqPtr parentbsp
)

{
  BioseqPtr          bsp = NULL;
  Char               buf [512];
  SeqFeatPtr         cds;
  SeqMgrFeatContext  cdscontext;
  Boolean            do_defline = TRUE;
  Uint2              entityID;
  SeqFeatPtr         gene = NULL;
  SeqMgrFeatContext  genecontext;
  BIG_ID             gi;
  GeneRefPtr         grp;
  IntAsn2gbJob       iaj;
  SeqLocPtr          loc;
  Boolean            partial5;
  Boolean            partial3;
  BioseqPtr          prod;
  CharPtr            ptr;
  SeqIdPtr           sip;
  CharPtr            str;
  Char               tmp [64];
  Boolean            unlock = FALSE;

  if (sfp == NULL || fp == NULL || crp == NULL) return;

  MemSet ((Pointer) &genecontext, 0, sizeof (SeqMgrFeatContext));
  MemSet ((Pointer) &cdscontext, 0, sizeof (SeqMgrFeatContext));

  /* locate the Bioseq the feature sits on, locking it by id if needed */
  bsp = BioseqFindFromSeqLoc (sfp->location);
  if (bsp == NULL) {
    sip = SeqLocId (sfp->location);
    if (sip == NULL) {
      loc = SeqLocFindNext (sfp->location, NULL);
      if (loc != NULL) {
        sip = SeqLocId (loc);
      }
    }
    if (sip != NULL) {
      bsp = BioseqLockById (sip);
      if (bsp != NULL) {
        unlock = TRUE;
      }
    }
  }
  if (bsp == NULL) {
    /* nothing to describe: emit a placeholder defline */
    do_defline = FALSE;
    StringCpy (buf, "lcl|");
    sip = SeqLocId (sfp->location);
    if (sip != NULL) {
      SeqIdWrite (sip, tmp, PRINTID_TEXTID_ACC_VER, sizeof (tmp) - 1);
      StringCat (buf, tmp);
    }
    if (StringDoesHaveText (idSuffix) && StringLen (idSuffix) < 200) {
      StringCat (buf, idSuffix);
    }
    FastaFileFunc (bsp, FASTA_ID, buf, sizeof (buf), (Pointer) fp);
    StringCpy (buf, "?");
    FastaFileFunc (bsp, FASTA_DEFLINE, buf, sizeof (buf), (Pointer) fp);
    fflush (fp);
  }

  if (do_defline) {
    /* the feature lookup below is made against the feature index */
    entityID = ObjMgrGetEntityIDForPointer (bsp);
    if (SeqMgrFeaturesAreIndexed (entityID) == 0) {
      SeqMgrIndexFeatures (entityID, NULL);
    }
    cds = SeqMgrGetDesiredFeature (0, bsp, 0, 0, sfp, &cdscontext);
    if (sfp != cds) {
      do_defline = FALSE;
      StringCpy (buf, "lcl|");
      sip = SeqIdFindWorst (bsp->id);
      if (sip != NULL) {
        SeqIdWrite (sip, tmp, PRINTID_TEXTID_ACC_VER, sizeof (tmp) - 1);
        StringCat (buf, tmp);
      }
      if (StringDoesHaveText (idSuffix) && StringLen (idSuffix) < 200) {
        StringCat (buf, idSuffix);
      }
      FastaFileFunc (bsp, FASTA_ID, buf, sizeof (buf), (Pointer) fp);
      StringCpy (buf, "??");
      FastaFileFunc (bsp, FASTA_DEFLINE, buf, sizeof (buf), (Pointer) fp);
      fflush (fp);
    }
  }

  if (do_defline) {
    CheckSeqLocForPartial (sfp->location, &partial5, &partial3);

    /* an overlapping gene is looked up unless a gene xref suppresses it */
    grp = SeqMgrGetGeneXref (sfp);
    if (grp == NULL || (! SeqMgrGeneIsSuppressed (grp))) {
      gene = SeqMgrGetOverlappingGene (sfp->location, &genecontext);
    }

    MemSet ((Pointer) &iaj, 0, sizeof (IntAsn2gbJob));
    iaj.flags.iupacaaOnly = FALSE;
    iaj.relModeError = FALSE;

    if (parentbsp == NULL) {
      parentbsp = bsp;
    }

    StringCpy (buf, "lcl|");
    sip = SeqIdFindWorst (parentbsp->id);
    if (sip != NULL) {
      SeqIdWrite (sip, tmp, PRINTID_TEXTID_ACC_VER, sizeof (tmp) - 1);
      StringCat (buf, tmp);
    }
    if (StringDoesHaveText (idSuffix) && StringLen (idSuffix) < 200) {
      StringCat (buf, idSuffix);
    }

    FastaFileFunc (bsp, FASTA_ID, buf, sizeof (buf), (Pointer) fp);

    buf [0] = '\0';
    if (StringDoesHaveText (genecontext.label)) {
      StringCat (buf, "[gene=");
      StringCat (buf, genecontext.label);
      StringCat (buf, "] ");
    }
    if (StringDoesHaveText (cdscontext.label)) {
      StringCat (buf, "[protein=");
      StringCat (buf, cdscontext.label);
      StringCat (buf, "] ");
    }
    if (crp->frame == 2) {
      StringCat (buf, "[frame=2] ");
    } else if (crp->frame == 3) {
      StringCat (buf, "[frame=3] ");
    }
    if (partial5 && partial3) {
      StringCat (buf, "[partial=5',3'] ");
    } else if (partial5) {
      StringCat (buf, "[partial=5'] ");
    } else if (partial3) {
      StringCat (buf, "[partial=3'] ");
    }
    if (sfp->product != NULL) {
      tmp [0] = '\0';
      sip = SeqLocId (sfp->product);
      if (sip != NULL && sip->choice == SEQID_GI) {
        /* turn a gi product id into an accession.version if possible */
        prod = BioseqFind (sip);
        if (prod != NULL) {
          sip = SeqIdFindWorst (prod->id);
          if (sip != NULL) {
            SeqIdWrite (sip, tmp, PRINTID_TEXTID_ACC_VER, sizeof (tmp) - 1);
          }
        } else {
          gi = sip->data.intvalue;
          sip = GetSeqIdForGI (gi);
          if (sip != NULL) {
            SeqIdWrite (sip, tmp, PRINTID_TEXTID_ACC_VER, sizeof (tmp) - 1);
          }
          SeqIdFree (sip);
        }
      } else if (sip != NULL) {
        SeqIdWrite (sip, tmp, PRINTID_TEXTID_ACC_VER, sizeof (tmp) - 1);
      }
      if (StringDoesHaveText (tmp)) {
        StringCat (buf, "[protein_id=");
        StringCat (buf, tmp);
        StringCat (buf, "] ");
      }
    }
    if (mappedloc == NULL) {
      mappedloc = sfp->location;
    }

    str = FFFlatLoc (&iaj, bsp, mappedloc, FALSE, FALSE);

    /* the location text can be long, so the final defline is heap built;
       30 spare bytes cover "[location=", "] " and the terminator */
    ptr = (CharPtr) MemNew ((StringLen (buf) + StringLen (str) + 30) * sizeof (Char));
    if (ptr != NULL) {
      StringCpy (ptr, buf);
      if (str != NULL) {
        StringCat (ptr, "[location=");
        StringCat (ptr, str);
        StringCat (ptr, "] ");
      }
      TrimSpacesAroundString (ptr);

      FastaFileFunc (bsp, FASTA_DEFLINE, ptr, StringLen (ptr), (Pointer) fp);

      MemFree (ptr);
    }

    MemFree (str);

    fflush (fp);
  }

  /* release the lock regardless of which defline form was written */
  if (unlock) {
    BioseqUnlock (bsp);
  }
}
1649 
CdRegionFastaStreamEx(SeqFeatPtr sfp,FILE * fp,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen,Boolean do_defline,CharPtr idSuffix,SeqLocPtr mappedloc,BioseqPtr parentbsp)1650 NLM_EXTERN Int4 CdRegionFastaStreamEx (
1651   SeqFeatPtr sfp,
1652   FILE *fp,
1653   StreamFlgType flags,
1654   Int2 linelen,
1655   Int2 blocklen,
1656   Int2 grouplen,
1657   Boolean do_defline,
1658   CharPtr idSuffix,
1659   SeqLocPtr mappedloc,
1660   BioseqPtr parentbsp
1661 )
1662 
1663 {
1664   CdRegionPtr  crp;
1665   Int2         skip = 0;
1666 
1667   if (sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION) return 0;
1668   if (fp == NULL) return 0;
1669   crp = (CdRegionPtr) sfp->data.value.ptrvalue;
1670   if (crp == NULL) return 0;
1671 
1672   if (do_defline) {
1673     DoSpecialDefline (sfp, fp, crp, idSuffix, mappedloc, parentbsp);
1674   }
1675 
1676   if (crp->frame == 2) {
1677     skip = 1;
1678   } else if (crp->frame == 3) {
1679     skip = 2;
1680   }
1681 
1682   return BioseqFastaStreamInternal (NULL, sfp->location, NULL, NULL, fp, NULL, flags,
1683                                     linelen, blocklen, grouplen,
1684                                     FALSE, FALSE, FALSE, skip);
1685 }
1686 
CdRegionFastaStream(SeqFeatPtr sfp,FILE * fp,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen,Boolean do_defline,CharPtr idSuffix)1687 NLM_EXTERN Int4 CdRegionFastaStream (
1688   SeqFeatPtr sfp,
1689   FILE *fp,
1690   StreamFlgType flags,
1691   Int2 linelen,
1692   Int2 blocklen,
1693   Int2 grouplen,
1694   Boolean do_defline,
1695   CharPtr idSuffix
1696 )
1697 
1698 {
1699   return CdRegionFastaStreamEx (sfp, fp, flags, linelen, blocklen, grouplen,
1700                                 do_defline, idSuffix, NULL, NULL);
1701 }
1702 
TranslationFastaStreamEx(SeqFeatPtr sfp,FILE * fp,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen,Boolean do_defline,CharPtr idSuffix,SeqLocPtr mappedloc,BioseqPtr parentbsp)1703 NLM_EXTERN Int4 TranslationFastaStreamEx (
1704   SeqFeatPtr sfp,
1705   FILE *fp,
1706   StreamFlgType flags,
1707   Int2 linelen,
1708   Int2 blocklen,
1709   Int2 grouplen,
1710   Boolean do_defline,
1711   CharPtr idSuffix,
1712   SeqLocPtr mappedloc,
1713   BioseqPtr parentbsp
1714 )
1715 
1716 {
1717   ByteStorePtr  bs;
1718   Char          ch;
1719   Int4          count = 0;
1720   CdRegionPtr   crp;
1721   size_t        prtlen;
1722   CharPtr       ptr;
1723   CharPtr       str;
1724 
1725   if (sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION) return 0;
1726   if (fp == NULL) return 0;
1727   crp = (CdRegionPtr) sfp->data.value.ptrvalue;
1728   if (crp == NULL) return 0;
1729 
1730   if (do_defline) {
1731     DoSpecialDefline (sfp, fp, crp, idSuffix, mappedloc, parentbsp);
1732   }
1733 
1734   str = NULL;
1735   bs = ProteinFromCdRegionEx (sfp, TRUE, FALSE);
1736   str = BSMerge (bs, NULL);
1737   bs = BSFree (bs);
1738 
1739   if (str != NULL) {
1740     ptr = str;
1741     ch = *ptr;
1742     while (ch != '\0') {
1743       *ptr = TO_UPPER (ch);
1744       ptr++;
1745       ch = *ptr;
1746     }
1747     prtlen = StringLen (str);
1748     if (prtlen > 1) {
1749        if (str [prtlen - 1] == '*') {
1750          str [prtlen - 1] = '\0';
1751        }
1752     }
1753   }
1754 
1755   count = BioseqFastaStreamInternal (NULL, NULL, NULL, str, fp, NULL, flags,
1756                                      linelen, blocklen, grouplen,
1757                                      FALSE, FALSE, FALSE, 0);
1758 
1759   MemFree (str);
1760 
1761   return count;
1762 }
1763 
TranslationFastaStream(SeqFeatPtr sfp,FILE * fp,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen,Boolean do_defline,CharPtr idSuffix)1764 NLM_EXTERN Int4 TranslationFastaStream (
1765   SeqFeatPtr sfp,
1766   FILE *fp,
1767   StreamFlgType flags,
1768   Int2 linelen,
1769   Int2 blocklen,
1770   Int2 grouplen,
1771   Boolean do_defline,
1772   CharPtr idSuffix
1773 )
1774 
1775 {
1776   return TranslationFastaStreamEx (sfp, fp, flags, linelen, blocklen, grouplen,
1777                                    do_defline, idSuffix, NULL, NULL);
1778 }
1779 
/*
 * DoGeneDefline
 *
 * Writes the FASTA id and defline for a gene feature to fp through
 * FastaFileFunc.
 *
 *   sfp        the gene feature
 *   grp        must be non-NULL on entry; the GeneRef actually used is the
 *              one stored in sfp
 *   idSuffix   optional text appended to the "lcl|<id>" identifier; ignored
 *              when it has no text or is 200 characters or longer
 *   mappedloc  location printed in [location=...]; sfp->location if NULL
 *   parentbsp  Bioseq whose id is used for the identifier; the Bioseq that
 *              carries the feature if NULL
 *
 * Three outcomes:
 *   - no Bioseq can be found or locked for the location: defline "?"
 *   - the indexed feature lookup does not return sfp itself: defline "??"
 *   - otherwise a tagged defline with gene, locus_tag and location tags
 *
 * Tags that would not fit in the 512-byte defline buffer are left out
 * rather than written past its end.  Features are indexed before the
 * lookup (as DoSpecialDefline does), the location string is always freed,
 * and a Bioseq obtained with BioseqLockById is unlocked on every path.
 */
static void DoGeneDefline (
  SeqFeatPtr sfp,
  FILE *fp,
  GeneRefPtr grp,
  CharPtr idSuffix,
  SeqLocPtr mappedloc,
  BioseqPtr parentbsp
)

{
  BioseqPtr          bsp = NULL;
  Char               buf [512];
  Boolean            do_defline = TRUE;
  Uint2              entityID;
  SeqMgrFeatContext  genecontext;
  IntAsn2gbJob       iaj;
  Boolean            partial5;
  Boolean            partial3;
  SeqIdPtr           sip;
  CharPtr            str;
  Char               tmp [64];
  Boolean            unlock = FALSE;

  if (sfp == NULL || fp == NULL || grp == NULL) return;
  if (sfp->data.choice != SEQFEAT_GENE) return;
  grp = (GeneRefPtr) sfp->data.value.ptrvalue;
  if (grp == NULL) return;

  MemSet ((Pointer) &genecontext, 0, sizeof (SeqMgrFeatContext));

  /* locate the Bioseq the feature sits on, locking it by id if needed */
  bsp = BioseqFindFromSeqLoc (sfp->location);
  if (bsp == NULL) {
    sip = SeqLocId (sfp->location);
    if (sip != NULL) {
      bsp = BioseqLockById (sip);
      if (bsp != NULL) {
        unlock = TRUE;
      }
    }
  }
  if (bsp == NULL) {
    /* nothing to describe: emit a placeholder defline */
    do_defline = FALSE;
    StringCpy (buf, "lcl|");
    sip = SeqLocId (sfp->location);
    if (sip != NULL) {
      SeqIdWrite (sip, tmp, PRINTID_TEXTID_ACC_VER, sizeof (tmp) - 1);
      StringCat (buf, tmp);
    }
    if (StringDoesHaveText (idSuffix) && StringLen (idSuffix) < 200) {
      StringCat (buf, idSuffix);
    }
    FastaFileFunc (bsp, FASTA_ID, buf, sizeof (buf), (Pointer) fp);
    StringCpy (buf, "?");
    FastaFileFunc (bsp, FASTA_DEFLINE, buf, sizeof (buf), (Pointer) fp);
    fflush (fp);
  }

  if (do_defline) {
    /* the feature lookup below is made against the feature index */
    entityID = ObjMgrGetEntityIDForPointer (bsp);
    if (SeqMgrFeaturesAreIndexed (entityID) == 0) {
      SeqMgrIndexFeatures (entityID, NULL);
    }
    if (sfp != SeqMgrGetDesiredFeature (0, bsp, 0, 0, sfp, &genecontext)) {
      do_defline = FALSE;
      StringCpy (buf, "lcl|");
      sip = SeqIdFindWorst (bsp->id);
      if (sip != NULL) {
        SeqIdWrite (sip, tmp, PRINTID_TEXTID_ACC_VER, sizeof (tmp) - 1);
        StringCat (buf, tmp);
      }
      if (StringDoesHaveText (idSuffix) && StringLen (idSuffix) < 200) {
        StringCat (buf, idSuffix);
      }
      FastaFileFunc (bsp, FASTA_ID, buf, sizeof (buf), (Pointer) fp);
      StringCpy (buf, "??");
      FastaFileFunc (bsp, FASTA_DEFLINE, buf, sizeof (buf), (Pointer) fp);
      fflush (fp);
    }
  }

  if (do_defline) {
    CheckSeqLocForPartial (sfp->location, &partial5, &partial3);

    MemSet ((Pointer) &iaj, 0, sizeof (IntAsn2gbJob));
    iaj.flags.iupacaaOnly = FALSE;
    iaj.relModeError = FALSE;

    if (parentbsp == NULL) {
      parentbsp = bsp;
    }

    StringCpy (buf, "lcl|");
    sip = SeqIdFindWorst (parentbsp->id);
    if (sip != NULL) {
      SeqIdWrite (sip, tmp, PRINTID_TEXTID_ACC_VER, sizeof (tmp) - 1);
      StringCat (buf, tmp);
    }
    if (StringDoesHaveText (idSuffix) && StringLen (idSuffix) < 200) {
      StringCat (buf, idSuffix);
    }

    FastaFileFunc (bsp, FASTA_ID, buf, sizeof (buf), (Pointer) fp);

    /* each tag is added only if it, its brackets and the NUL still fit */
    buf [0] = '\0';
    if (StringDoesHaveText (grp->locus) &&
        StringLen (grp->locus) + 8 < sizeof (buf)) {
      StringCat (buf, "[gene=");
      StringCat (buf, grp->locus);
      StringCat (buf, "] ");
    }
    if (StringDoesHaveText (grp->locus_tag) &&
        StringLen (buf) + StringLen (grp->locus_tag) + 13 < sizeof (buf)) {
      StringCat (buf, "[locus_tag=");
      StringCat (buf, grp->locus_tag);
      StringCat (buf, "] ");
    }
    /* fall back to the indexed feature label when nothing was written */
    if (StringLen (buf) == 0 && StringDoesHaveText (genecontext.label) &&
        StringLen (genecontext.label) + 8 < sizeof (buf)) {
      StringCat (buf, "[gene=");
      StringCat (buf, genecontext.label);
      StringCat (buf, "] ");
    }
    if (mappedloc == NULL) {
      mappedloc = sfp->location;
    }
    str = FFFlatLoc (&iaj, bsp, mappedloc, FALSE, FALSE);
    /* "[location=" (10) + "] " (2) = 12 extra characters plus the NUL */
    if (str != NULL && StringLen (str) + StringLen (buf) < sizeof (buf) - 12) {
      StringCat (buf, "[location=");
      StringCat (buf, str);
      StringCat (buf, "] ");
    }
    /* freed whether or not the location was used */
    MemFree (str);
    TrimSpacesAroundString (buf);

    FastaFileFunc (bsp, FASTA_DEFLINE, buf, sizeof (buf), (Pointer) fp);

    fflush (fp);
  }

  /* release the lock regardless of which defline form was written */
  if (unlock) {
    BioseqUnlock (bsp);
  }
}
1921 
GeneFastaStreamEx(SeqFeatPtr sfp,FILE * fp,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen,Boolean do_defline,CharPtr idSuffix,SeqLocPtr mappedloc,BioseqPtr parentbsp)1922 NLM_EXTERN Int4 GeneFastaStreamEx (
1923   SeqFeatPtr sfp,
1924   FILE *fp,
1925   StreamFlgType flags,
1926   Int2 linelen,
1927   Int2 blocklen,
1928   Int2 grouplen,
1929   Boolean do_defline,
1930   CharPtr idSuffix,
1931   SeqLocPtr mappedloc,
1932   BioseqPtr parentbsp
1933 )
1934 
1935 {
1936   GeneRefPtr  grp;
1937 
1938   if (sfp == NULL || sfp->data.choice != SEQFEAT_GENE) return 0;
1939   if (fp == NULL) return 0;
1940   grp = (GeneRefPtr) sfp->data.value.ptrvalue;
1941   if (grp == NULL) return 0;
1942 
1943   if (do_defline) {
1944     DoGeneDefline (sfp, fp, grp, idSuffix, mappedloc, parentbsp);
1945   }
1946 
1947   return BioseqFastaStreamInternal (NULL, sfp->location, NULL, NULL, fp, NULL, flags,
1948                                     linelen, blocklen, grouplen,
1949                                     FALSE, FALSE, FALSE, 0);
1950 }
1951 
GeneFastaStream(SeqFeatPtr sfp,FILE * fp,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen,Boolean do_defline,CharPtr idSuffix)1952 NLM_EXTERN Int4 GeneFastaStream (
1953   SeqFeatPtr sfp,
1954   FILE *fp,
1955   StreamFlgType flags,
1956   Int2 linelen,
1957   Int2 blocklen,
1958   Int2 grouplen,
1959   Boolean do_defline,
1960   CharPtr idSuffix
1961 )
1962 
1963 {
1964   return GeneFastaStreamEx (sfp, fp, flags, linelen, blocklen, grouplen,
1965                             do_defline, idSuffix, NULL, NULL);
1966 }
1967 
1968 /*****************************************************************************
1969 *
*   SeqEntryFastaStream (sep, fp, flags, linelen, blocklen, grouplen,
1971 *                        do_na, do_aa, master_style)
1972 *
1973 *       Rapid FASTA generator on ASN.1 record including GenBank release set
1974 *
1975 *****************************************************************************/
1976 
1977 typedef struct fastastreamdata {
1978   FILE           *fp;
1979   StreamFlgType  flags;
1980   Int2           linelen;
1981   Int2           blocklen;
1982   Int2           grouplen;
1983   Boolean        do_na;
1984   Boolean        do_aa;
1985   Boolean        master_style;
1986   Boolean        failed;
1987   Int4           count;
1988   Boolean        substitute_ids;
1989   Boolean        sorted_prot;
1990 } FastaStreamData, PNTR FastaStreamPtr;
1991 
GetSegParts(BioseqPtr bsp)1992 static BioseqSetPtr GetSegParts (
1993   BioseqPtr bsp
1994 )
1995 
1996 {
1997   BioseqSetPtr  bssp;
1998   SeqEntryPtr   sep;
1999   if (bsp == NULL || bsp->repr != Seq_repr_seg) return NULL;
2000   sep = bsp->seqentry;
2001   if (sep == NULL) return NULL;
2002   sep = sep->next;
2003   if (sep == NULL || (! IS_Bioseq_set (sep))) return NULL;
2004   bssp = (BioseqSetPtr) sep->data.ptrvalue;
2005   if (bssp != NULL && bssp->_class == BioseqseqSet_class_parts) return bssp;
2006   return NULL;
2007 }
2008 
/*
 * FastaOneBioseq
 *
 *   Visitor callback that streams one bioseq as FASTA.  userdata must point
 *   to a FastaStreamData.
 *
 *   - Nucleotide bioseqs are skipped unless do_na is set, protein bioseqs
 *     unless do_aa is set.
 *   - A segmented bioseq is replaced by its parts (via GetSegParts and a
 *     recursive visit) unless master_style is set or no parts set is found.
 *   - Only raw, seg, const, delta, ref and virtual representations are
 *     written.
 *   - A negative count from BioseqFastaStreamEx marks the run as failed; its
 *     magnitude is still added to the running total.
 */
static void FastaOneBioseq (
  BioseqPtr bsp,
  Pointer userdata
)

{
  FastaStreamPtr  ctx;
  BioseqSetPtr    segparts;
  Boolean         wanted;
  Int4            written;

  if (bsp == NULL) return;
  ctx = (FastaStreamPtr) userdata;
  if (ctx == NULL) return;

  /* return if molecule not right for format */
  if (ISA_na (bsp->mol)) {
    wanted = ctx->do_na;
  } else if (ISA_aa (bsp->mol)) {
    wanted = ctx->do_aa;
  } else {
    wanted = TRUE;
  }
  if (! wanted) return;

  /* if bsp followed by parts set, recurse to make FASTA from individual parts */
  if (bsp->repr == Seq_repr_seg && (! ctx->master_style)) {
    segparts = GetSegParts (bsp);
    if (segparts != NULL) {
      VisitBioseqsInSet (segparts, (Pointer) ctx, FastaOneBioseq);
      return;
    }
  }

  /* only these representations are written */
  switch (bsp->repr) {
    case Seq_repr_raw :
    case Seq_repr_seg :
    case Seq_repr_const :
    case Seq_repr_delta :
    case Seq_repr_ref :
    case Seq_repr_virtual :
      break;
    default :
      return;
  }

  written = BioseqFastaStreamEx (bsp, ctx->fp, ctx->flags,
                                 ctx->linelen, ctx->blocklen, ctx->grouplen,
                                 TRUE, ctx->substitute_ids, ctx->sorted_prot);
  if (written >= 0) {
    ctx->count += written;
  } else {
    /* negative result signals failure; subtracting it adds its magnitude */
    ctx->failed = TRUE;
    ctx->count -= written;
  }
}
2050 
SeqEntryFastaStreamEx(SeqEntryPtr sep,FILE * fp,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen,Boolean do_na,Boolean do_aa,Boolean master_style,Boolean substitute_ids,Boolean sorted_prot)2051 NLM_EXTERN Int4 SeqEntryFastaStreamEx (
2052   SeqEntryPtr sep,
2053   FILE *fp,
2054   StreamFlgType flags,
2055   Int2 linelen,
2056   Int2 blocklen,
2057   Int2 grouplen,
2058   Boolean do_na,
2059   Boolean do_aa,
2060   Boolean master_style,
2061   Boolean substitute_ids,
2062   Boolean sorted_prot
2063 )
2064 
2065 {  BioseqPtr        bsp = NULL;
2066   BioseqSetPtr     bssp = NULL;
2067   Uint2            entityID = 0;
2068   FastaStreamData  fsd;
2069   SeqEntryPtr      oldscope;
2070   if (sep == NULL || fp == NULL) return 0;
2071   if (IS_Bioseq (sep)) {
2072     bsp = (BioseqPtr) sep->data.ptrvalue;
2073     entityID = ObjMgrGetEntityIDForPointer (bsp);
2074   } else if (IS_Bioseq_set (sep)) {
2075     bssp = (BioseqSetPtr) sep->data.ptrvalue;
2076     entityID = ObjMgrGetEntityIDForPointer (bssp);
2077   }
2078   if (entityID == 0) return 0;
2079   /* AssignIDs sets bsp->seqentry so GetSegParts can work */
2080   AssignIDsInEntity (entityID, 0, NULL);
2081   fsd.fp = fp;
2082   fsd.flags = flags;
2083   fsd.linelen = linelen;
2084   fsd.blocklen = blocklen;
2085   fsd.grouplen = grouplen;
2086   fsd.do_na = do_na;
2087   fsd.do_aa = do_aa;
2088   fsd.master_style = master_style;
2089   fsd.failed = FALSE;
2090   fsd.count = 0;
2091   fsd.substitute_ids = substitute_ids;
2092   fsd.sorted_prot = sorted_prot;
2093   oldscope = SeqEntrySetScope (sep);
2094   if (bssp != NULL) {
2095     /* handle all components of a pop/phy/mut/eco set */
2096     sep = SeqMgrGetSeqEntryForData (bssp);
2097     VisitSequencesInSep (sep, (Pointer) &fsd, VISIT_MAINS, FastaOneBioseq);
2098   } else {
2099     /* handle single bioseq, which may be segmented or a local part */
2100     FastaOneBioseq (bsp, (Pointer) &fsd);
2101   }
2102   SeqEntrySetScope (oldscope);
2103   if (fsd.failed) {
2104     return -fsd.count;
2105   }
2106   return fsd.count;
2107 }
2108 
SeqEntryFastaStream(SeqEntryPtr sep,FILE * fp,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen,Boolean do_na,Boolean do_aa,Boolean master_style)2109 NLM_EXTERN Int4 SeqEntryFastaStream (
2110   SeqEntryPtr sep,
2111   FILE *fp,
2112   StreamFlgType flags,
2113   Int2 linelen,
2114   Int2 blocklen,
2115   Int2 grouplen,
2116   Boolean do_na,
2117   Boolean do_aa,
2118   Boolean master_style
2119 )
2120 
2121 {  return SeqEntryFastaStreamEx (sep, fp, flags, linelen, blocklen, grouplen, do_na, do_aa, master_style, FALSE, FALSE);
2122 }
2123 
MakeFastaStreamIdSuffix(SeqFeatPtr sfp,Uint4 idx,CharPtr prefix,CharPtr buf,Boolean do_product,Boolean do_feat_id)2124 NLM_EXTERN void MakeFastaStreamIdSuffix (
2125   SeqFeatPtr sfp,
2126   Uint4 idx,
2127   CharPtr prefix,
2128   CharPtr buf,
2129   Boolean do_product,
2130   Boolean do_feat_id
2131 )
2132 
2133 {
2134   Char       fbuf [64];
2135   BIG_ID     gi;
2136   BioseqPtr  pbsp;
2137   Char       pbuf [64];
2138   SeqIdPtr   sip;
2139 
2140   if (sfp == NULL || buf == NULL) return;
2141 
2142   StringCpy (buf, prefix);
2143   fbuf [0] = '\0';
2144   pbuf [0] = '\0';
2145   if (do_product && sfp->product != NULL) {
2146     pbsp = BioseqFindFromSeqLoc (sfp->product);
2147     if (pbsp != NULL) {
2148       SeqIdWrite (pbsp->id, pbuf, PRINTID_TEXTID_ACC_VER, sizeof (pbuf) - 1);
2149     } else {
2150       sip = SeqLocId (sfp->product);
2151       if (sip != NULL && sip->choice == SEQID_GI) {
2152         gi = sip->data.intvalue;
2153         sip = GetSeqIdForGI (gi);
2154         if (sip != NULL) {
2155           SeqIdWrite (sip, pbuf, PRINTID_TEXTID_ACC_VER, sizeof (pbuf) - 1);
2156         }
2157       }
2158     }
2159   }
2160   if (StringDoesHaveText (pbuf)) {
2161     StringCat (buf, "_");
2162     StringCat (buf, pbuf);
2163   }
2164   if (do_feat_id && idx > 0) {
2165     sprintf (fbuf, "%ld", (long) idx);
2166     StringCat (buf, "_");
2167     StringCat (buf, fbuf);
2168   }
2169 }
2170 
2171 /*****************************************************************************
2172 *
2173 *   Here are functions that convert FASTA format from file or from memory
2174 *
2175 *****************************************************************************/
2176 /********* DEFINES *********/
2177 #define FTSE_BUFF_CHUNK 4096
2178 #define BIOSEQ 1
2179 /********* INTERNAL FUNCTIONS *********/
2180 NLM_EXTERN SeqEntryPtr FastaToSeqEntryInternalEx
2181 (
2182  VoidPtr input,          /* input pointer (file or memory) */
2183  Int4 type,              /* type of inquiry FASTA_MEM_IO or FASTA_FILE_IO */
2184  CharPtr PNTR next_char, /* returned pointer to next FASTA sequence */
2185  Boolean is_na,          /* type of sequence */
2186  CharPtr PNTR errormsg,  /* error messge for debugging */
2187  Boolean parseSeqId,     /* Parse SeqID from def line */
2188  CharPtr special_symbol, /* Returns special symbol if no SeqEntry */
2189  CharPtr prefix,         /* prefix for localID if not parsable */
2190  Int2Ptr ctrptr,         /* starting point for constructing unique ID */
2191  SeqLocPtr PNTR mask_ptr /* Pointer to a SeqLoc to Fill with Masking information */
2192  );
2193 NLM_EXTERN SeqEntryPtr FastaToSeqEntryInternal
2194 (
2195  VoidPtr input,          /* input pointer (file or memory) */
2196  Int4 type,              /* type of inquiry FASTA_MEM_IO or FASTA_FILE_IO */
2197  CharPtr PNTR last_char, /* returned pointer to next FASTA sequence */
2198  Boolean is_na,          /* type of sequence */
2199  CharPtr PNTR errormsg,  /* error messge for debugging */
2200  Boolean parseSeqId,     /* Parse SeqID from def line */
2201  CharPtr special_symbol  /* Returns special symbol if no SeqEntry */
2202  );
2203 static Boolean FastaReadSequenceInternal
2204 (
2205  VoidPtr input,          /* input pointer (file or memory) */
2206  Int4 type,              /* type of inquiry FASTA_MEM_IO or FASTA_FILE_IO */
2207  CharPtr PNTR last_char, /* returned pointer to next FASTA sequence */
2208  Boolean is_na,          /* type of sequence */
2209  Int4Ptr seq_length,     /* Returned length of sequence in residues */
2210  ByteStorePtr PNTR,      /* Returned pointer to sequence ByteStore */
2211  CharPtr PNTR errormsg,  /* error messge for debugging */
2212  CharPtr special_symbol  /* Returns special symbol if no SeqEntry */
2213  );
2214 static Boolean FastaReadSequenceInternalEx
2215 (
2216  VoidPtr input,          /* input pointer (file or memory) */
2217  Int4 type,              /* type of inquiry FASTA_MEM_IO or FASTA_FILE_IO */
2218  CharPtr PNTR last_char, /* returned pointer to next FASTA sequence */
2219  Boolean is_na,          /* type of sequence */
2220  Int4Ptr seq_length,     /* Returned length of sequence in residues */
2221  ByteStorePtr PNTR,      /* Returned pointer to sequence ByteStore */
2222  CharPtr PNTR errormsg,  /* error messge for debugging */
2223  CharPtr special_symbol, /* Returns special symbol if no SeqEntry */
2224  SeqLocPtr PNTR mask_ptr,/* Pointer to a SeqLoc to Fill with Masking information */
2225  SeqIdPtr sip            /* SeqId of current sequence used for Masking Info */
2226  );
2227 static Int4 FastaReadSequenceChunk
2228 (
2229  VoidPtr input,          /* input pointer (file or memory) */
2230  Int4    type,           /* type of inquiry FASTA_MEM_IO or FASTA_FILE_IO */
2231  CharPtr PNTR next_char, /* returned pointer to next FASTA sequence */
2232  Uint1Ptr sequence,      /* buffer to read sequence to */
2233  Int4     length,        /* size of buffer */
2234  CharPtr special_symbol  /* Returns special symbol if no SeqEntry */
2235  );
2236 static SeqEntryPtr FastaToSeqEntryInternalExEx
2237 (
2238  VoidPtr input,           /* input pointer (file or memory) */
2239  Int4 type,               /* type of inquiry FASTA_MEM_IO or FASTA_FILE_IO */
2240  CharPtr PNTR next_char,  /* returned pointer to next FASTA sequence */
2241  Boolean is_na,           /* type of sequence */
2242  CharPtr PNTR errormsg,   /* error messge for debugging */
2243  Boolean parseSeqId,      /* Parse SeqID from def line */
2244  CharPtr special_symbol,  /* Returns special symbol if no SeqEntry */
2245  CharPtr prefix,          /* prefix for localID if not parsable */
2246  Int2Ptr ctrptr,          /* starting point for constructing unique ID */
2247  SeqLocPtr PNTR mask_ptr, /* Pointer to a SeqLoc to Fill with Masking information */
2248  Boolean trustID
2249  );
/********* FUNCTIONS *********/
2251 /*****************************************************************************
2252 *
2253 *   SeqEntryPFtr FastaToSeqBuffEx() - function to return SeqEntryPtr from
2254 *                                    buffer with error handling
2255 *
2256 *****************************************************************************/
FastaToSeqBuffEx(CharPtr buffer,CharPtr PNTR last_char,Boolean is_na,CharPtr PNTR errormsg,Boolean parseSeqId)2257 NLM_EXTERN SeqEntryPtr FastaToSeqBuffEx
2258   (
2259     CharPtr buffer,         /* buffer in memory with FASTA sequence */
2260     CharPtr PNTR last_char, /* here returned pointer to next FASTA if any */
2261     Boolean is_na,          /* type of sequence */
2262     CharPtr PNTR errormsg,  /* error message for debugging */
2263     Boolean parseSeqId      /* Parse SeqID from def line */
2264   )
2265 {
2266   return FastaToSeqEntryInternal((void *)buffer, FASTA_MEM_IO ,
2267                                  last_char, is_na, errormsg, parseSeqId, NULL);
2268 }
FastaToSeqBuffForDb(CharPtr buffer,CharPtr PNTR last_char,Boolean is_na,CharPtr PNTR errormsg,Boolean parseSeqId,CharPtr prefix,Int2Ptr ctrptr,SeqLocPtr PNTR mask_ptr)2269 NLM_EXTERN SeqEntryPtr FastaToSeqBuffForDb
2270   (
2271     CharPtr buffer,         /* buffer in memory with FASTA sequence */
2272     CharPtr PNTR last_char, /* here returned pointer to next FASTA if any */
2273     Boolean is_na,          /* type of sequence */
2274     CharPtr PNTR errormsg,  /* error message for debugging */
2275     Boolean parseSeqId,     /* Parse SeqID from def line */
2276     CharPtr prefix,         /* prefix for localID if not parsable */
2277     Int2Ptr ctrptr,         /* starting point for constructing unique ID */
2278     SeqLocPtr PNTR mask_ptr /* Pointer to a SeqLoc to Fill with Masking information from lowercased letters */
2279   )
2280 {
2281   return FastaToSeqEntryInternalExEx((void *)buffer, FASTA_MEM_IO ,
2282                      last_char, is_na, errormsg, parseSeqId,
2283                      NULL, prefix, ctrptr, mask_ptr, TRUE);
2284 }
2285 /*****************************************************************************
2286 *
2287 *   SeqEntryPtr FastaToSeqEntryEx() - function to return SeqEntryPtr from
2288 *                                     file with error handling
2289 *
2290 *****************************************************************************/
FastaToSeqEntryEx(FILE * fp,Boolean is_na,CharPtr PNTR errormsg,Boolean parseSeqId)2291 NLM_EXTERN SeqEntryPtr FastaToSeqEntryEx
2292   (
2293     FILE *fp,               /* file to get sequence from */
2294     Boolean is_na,          /* type of sequence */
2295     CharPtr PNTR errormsg,  /* error message for debugginq */
2296     Boolean parseSeqId      /* Parse SeqID from def line */
2297   )
2298 {
2299   return FastaToSeqEntryInternal((void *)fp, FASTA_FILE_IO,
2300                                  NULL,is_na, errormsg, parseSeqId, NULL);
2301 }
2302 /*****************************************************************************
2303 *
2304 *   SeqEntryPtr FastaToSeqEntryForDb() - function to return SeqEntryPtr from
2305 *                                     file with error handling and with control
2306 *                                     over generation of unique SeqIDs
2307 *
2308 *****************************************************************************/
FastaToSeqEntryForDb(FILE * fp,Boolean is_na,CharPtr PNTR errormsg,Boolean parseSeqId,CharPtr prefix,Int2Ptr ctrptr,SeqLocPtr PNTR mask_ptr)2309 NLM_EXTERN SeqEntryPtr FastaToSeqEntryForDb
2310   (
2311     FILE *fp,               /* file to get sequence from */
2312     Boolean is_na,          /* type of sequence */
2313     CharPtr PNTR errormsg,  /* error message for debugginq */
2314     Boolean parseSeqId,     /* Parse SeqID from def line */
2315     CharPtr prefix,         /* prefix for localID if not parsable */
2316     Int2Ptr ctrptr,         /* starting point for constructing unique ID */
2317     SeqLocPtr PNTR mask_ptr /* Pointer to a SeqLoc to Fill with Masking information from lowercased letters */
2318   )
2319 {
2320   return FastaToSeqEntryInternalExEx ((void *) fp, FASTA_FILE_IO,
2321                                  NULL, is_na, errormsg, parseSeqId,
2322                                  NULL, prefix, ctrptr, mask_ptr, TRUE);
2323 }
2324 /*****************************************************************************
2325 *
2326 *   SeqEntryPtr FastaToSeqEntry() - function to return SeqEntryPtr from
2327 *                                   file without error handling
2328 *
2329 *****************************************************************************/
FastaToSeqEntry(FILE * fp,Boolean is_na)2330 NLM_EXTERN SeqEntryPtr FastaToSeqEntry (FILE *fp, Boolean is_na)
2331 {
2332   return FastaToSeqEntryEx (fp, is_na, NULL, TRUE);
2333 }
2334 /*****************************************************************************
2335 *
2336 *   SeqEntryPtr FastaToSeqBuff() - function to return SeqEntryPtr from
2337 *                                   buffer without error handling
2338 *
2339 *****************************************************************************/
FastaToSeqBuff(CharPtr buffer,CharPtr PNTR last_char,Boolean is_na)2340 NLM_EXTERN SeqEntryPtr FastaToSeqBuff (CharPtr buffer, CharPtr PNTR last_char,
2341                                    Boolean is_na)
2342 {
2343   return FastaToSeqBuffEx (buffer, last_char, is_na, NULL, TRUE);
2344 }
2345 /*****************************************************************************
2346 *
2347 *   Boolean FastaReadSequenceChunk() - read sequence chunkfrom
2348 *                                      file or buffer for use in
2349 *                                      FastaReadsequenceInternal()
2350 *****************************************************************************/
FastaReadSequenceChunk(VoidPtr input,Int4 type,CharPtr PNTR next_char,Uint1Ptr sequence,Int4 length,CharPtr special_symbol)2351 static Int4 FastaReadSequenceChunk
2352 (
2353  VoidPtr input,          /* input pointer (file or memory) */
2354  Int4    type,           /* type of inquiry FASTA_MEM_IO or FASTA_FILE_IO */
2355  CharPtr PNTR next_char, /* returned pointer to next FASTA sequence */
2356  Uint1Ptr sequence,      /* buffer to read sequence to */
2357  Int4     length,        /* size of buffer */
2358  CharPtr special_symbol  /* Returns special symbol if no SeqEntry */
2359  )
2360 {
2361     const Char PNTR firstchar;
2362     FILE *fd;
2363     register Int4 i;
2364     Int2 ch = 0;
2365     /* Type of input depends upon calling function */
2366     if(type == FASTA_FILE_IO) {
2367     fd = (FILE *) input;
2368     /* Skip empty lines and lines starting with a comment symbol. */
2369     while (1) {
2370         ch = NLM_GETC(fd);
2371         /* Ignore lines starting with a comment symbol. */
2372         if (ch == '!' || ch == '#') {
2373             do {
2374                 ch = NLM_GETC(fd);
2375             } while (ch != '\n' && ch != '\r' && ch != '\0' && ch != EOF);
2376         }
2377         /* If end of file reached, return 0. */
2378         if (ch == EOF)
2379             return 0;
2380         /* If line not empty, break out of this loop. */
2381         if (ch != '\n' && ch != '\r')
2382             break;
2383     }
2384     if(ch == '>' || ch == '&' || ch == '{' || ch == '}' || ch == '[' || ch == ']')
2385     {
2386         ungetc(ch, fd);
2387         if (special_symbol != NULL) {
2388             *special_symbol = (Char) ch;
2389         }
2390         return 0;
2391     }
2392     sequence[0] = (Uint1) ch;
2393     if ((fgets((CharPtr) sequence+1, length-1, fd)) == NULL) {
2394         sequence [1] = '\0';
2395     }
2396     } else {   /* type == FASTA_MEM_IO */
2397         if((firstchar = (const Char PNTR) input) == NULL)
2398             return 0;
2399     }
2400     if(type == FASTA_FILE_IO) {
2401         for(i=0; i < length; i++) {
2402         if (sequence[i] == '\n' || sequence[i] == '\r' || sequence[i] == '\0')
2403         break;
2404         }
2405     } else { /* type = FASTA_MEM_IO */
2406         for(i =0; i < length && (ch = *firstchar) != NULLB; firstchar++, i++) {
2407             if((sequence[i] = (Char) ch) == '>' || (Char) ch == '&' || (Char) ch == '{' ||
2408                 (Char) ch == '}' || (Char) ch == '[' || (Char) ch == ']') {
2409                 if((i == 0) ||
2410                    (i > 0 && (sequence[i-1] == '\n' ||
2411                               sequence[i-1] == '\r'))) {
2412                     if (special_symbol != NULL) {
2413                         *special_symbol = (Char) ch;
2414                     }
2415                     break;
2416                 }
2417             }
2418         }
2419         if(ch == NULLB) /* the end of buffer */
2420             *next_char = NULL;
2421         else
2422             *next_char = (CharPtr) firstchar;
2423     }
2424     return i;
2425 }
2426 /*****************************************************************************
2427 *
2428 *   Boolean FastaReadSequence() - read sequence from file
2429 *
2430 *****************************************************************************/
FastaReadSequence(FILE * fd,Boolean is_na,Int4Ptr seq_length,ByteStorePtr PNTR bs_out,CharPtr PNTR errormsg)2431 Boolean FastaReadSequence
2432 (
2433  FILE *fd,            /* input pointer (file or memory) */
2434  Boolean is_na,            /* type of sequence */
2435  Int4Ptr seq_length,       /* Returned length of sequence in residues */
2436  ByteStorePtr PNTR bs_out, /* Returned pointer to sequence ByteStore */
2437  CharPtr PNTR errormsg     /* error message for debugging */
2438  )
2439 {
2440     return  FastaReadSequenceInternal((VoidPtr) fd, FASTA_FILE_IO, NULL,
2441                                       is_na, seq_length, bs_out, errormsg, NULL);
2442 }
2443 /*****************************************************************************
2444 *
2445 *   Boolean FastaReadSequenceMem() - read sequence from buffer
2446 *
2447 *****************************************************************************/
FastaReadSequenceMem(CharPtr buffer,CharPtr PNTR next_char,Boolean is_na,Int4Ptr seq_length,ByteStorePtr PNTR bs_out,CharPtr PNTR errormsg)2448 Boolean FastaReadSequenceMem
2449 (
2450  CharPtr buffer,           /* input buffer with sequence */
2451  CharPtr PNTR next_char,   /* returned pointer to next FASTA sequence */
2452  Boolean is_na,            /* type of sequence */
2453  Int4Ptr seq_length,       /* Returned length of sequence in residues */
2454  ByteStorePtr PNTR bs_out, /* Returned pointer to sequence ByteStore */
2455  CharPtr PNTR errormsg     /* error message for debugging */
2456 )
2457 {
2458     return  FastaReadSequenceInternal((VoidPtr) buffer, FASTA_MEM_IO,
2459                                       next_char, is_na, seq_length, bs_out,
2460                                       errormsg, NULL);
2461 }
2462 /*****************************************************************************
2463 *
2464 *   Boolean FastaReadSequenceInternal() - read sequence from
2465 *                                         file or buffer for internal use
2466 *
2467 *****************************************************************************/
FastaReadSequenceInternal(VoidPtr input,Int4 type,CharPtr PNTR next_char,Boolean is_na,Int4Ptr seq_length,ByteStorePtr PNTR bs_out,CharPtr PNTR errormsg,CharPtr special_symbol)2468 static Boolean FastaReadSequenceInternal
2469 (
2470  VoidPtr input,            /* input pointer (file or memory) */
2471  Int4 type,                /* type of inquiry FASTA_MEM_IO or FASTA_FILE_IO */
2472  CharPtr PNTR next_char,   /* returned pointer to next FASTA sequence */
2473  Boolean is_na,            /* type of sequence */
2474  Int4Ptr seq_length,       /* Returned length of sequence in residues */
2475  ByteStorePtr PNTR bs_out, /* Returned pointer to sequence ByteStore */
2476  CharPtr PNTR errormsg,    /* error message for debugging */
2477  CharPtr special_symbol  /* Returns special symbol if no SeqEntry */
2478 )
2479 {
2480     return FastaReadSequenceInternalEx(input,type,next_char,is_na,seq_length,bs_out,errormsg,special_symbol,NULL, NULL);
2481 }
2482 /*****************************************************************************
2483 *
2484 *   Boolean FastaReadSequenceInternalEx() - read sequence from
2485 *                                         file or buffer for internal use
2486 *                                         and Create Masked SeqLoc of Lowercase sequences.
2487 *
2488 *****************************************************************************/
FastaReadSequenceInternalEx(VoidPtr input,Int4 type,CharPtr PNTR next_char,Boolean is_na,Int4Ptr seq_length,ByteStorePtr PNTR bs_out,CharPtr PNTR errormsg,CharPtr special_symbol,SeqLocPtr PNTR mask_ptr,SeqIdPtr sip)2489 static Boolean FastaReadSequenceInternalEx
2490 (
2491  VoidPtr input,            /* input pointer (file or memory) */
2492  Int4 type,                /* type of inquiry FASTA_MEM_IO or FASTA_FILE_IO */
2493  CharPtr PNTR next_char,   /* returned pointer to next FASTA sequence */
2494  Boolean is_na,            /* type of sequence */
2495  Int4Ptr seq_length,       /* Returned length of sequence in residues */
2496  ByteStorePtr PNTR bs_out, /* Returned pointer to sequence ByteStore */
2497  CharPtr PNTR errormsg,    /* error message for debugging */
2498  CharPtr special_symbol,   /* Returns special symbol if no SeqEntry */
2499  SeqLocPtr PNTR mask_ptr,  /* Pointer to a SeqLoc to Fill with Masking information */
2500  SeqIdPtr sip            /* SeqId of current sequence used for Masking Info */
2501 )
2502 {
2503     SeqMapTablePtr smtp;
2504     Uint1Ptr       in_buff, out_buff;
2505     CharPtr        ptr, chptr;
2506     Int2           ch;
2507     Uint1          byte_from, uch;
2508     register Int4  i;
2509     CharPtr        badchar = NULL;
2510     Int4           in_index, out_index, total_read, badchars = 0;
2511     Int4           total_length = 0;
2512     Int4           mask_to;
2513     Char           tmp[32];
2514     ValNodePtr     mask_head,mask,mask_new;
2515     SeqIntPtr      mask_sint;
2516     Boolean        Second, skip_to_eol, last_was_star;
2517     Boolean        this_char_masked;
2518     if (input == NULL)     /* empty input */
2519         return FALSE;
2520     /* Initializing conversion tables */
2521     if(is_na) {
2522         if((smtp = SeqMapTableFind(Seq_code_ncbi4na,
2523                                    Seq_code_iupacna)) == NULL) {
2524             return FALSE;
2525         }
2526     } else {
2527         if((smtp = SeqMapTableFind(Seq_code_ncbistdaa,
2528                                    Seq_code_ncbieaa)) == NULL) {
2529             return FALSE;
2530         }
2531     }
2532     /* Allocationg error message buffers if required */
2533     if (errormsg != NULL) {
2534         *errormsg = NULL;
2535         if((badchar = (CharPtr) MemNew(256)) == NULL)
2536             return FALSE;
2537     }
2538     if((in_buff = (Uint1Ptr) MemNew(FTSE_BUFF_CHUNK)) == NULL)
2539         return FALSE;
2540     if((out_buff = (Uint1Ptr) MemNew(FTSE_BUFF_CHUNK)) == NULL)
2541         return FALSE;
2542     if((*bs_out = BSNew(FTSE_BUFF_CHUNK)) == NULL)
2543         return FALSE;
2544     Second = FALSE;
2545     skip_to_eol = FALSE;
2546     last_was_star = FALSE;
2547     in_index = out_index = total_read = 0;
2548     if(mask_ptr) {
2549         mask_head=mask=NULL;
2550         mask_sint=NULL;
2551         this_char_masked=FALSE;
2552     }
2553     while(TRUE) {
2554         if (in_index == total_read) {
2555             if((total_read = FastaReadSequenceChunk(input, type,
2556                                                     next_char, in_buff,
2557                                                     FTSE_BUFF_CHUNK, special_symbol)) == 0)
2558                 break; /* Here is exit from the loop */
2559             if(type == FASTA_MEM_IO)
2560                 input = (VoidPtr) *next_char;
2561             in_index = 0;
2562         }
2563           byte_from = in_buff[in_index];
2564           in_index++;
2565         if ((! is_na) && (! last_was_star) && byte_from == '*') {
2566             last_was_star = TRUE;
2567         } else if(byte_from != ';' && !skip_to_eol) {
2568             if(mask_ptr) {
2569                 if(IS_LOWER(byte_from)) {
2570                     if(this_char_masked) {
2571                         mask_to++;
2572                     } else { /* First lowercase character in this segment */
2573                         this_char_masked = TRUE;
2574                         /* save previous segment if any */
2575                         mask_new = ValNodeNew(NULL);
2576                         mask_new->choice = SEQLOC_INT;
2577                         if(mask_sint) {
2578                             mask_sint->to = mask_to;
2579                             mask->next = mask_new;
2580                         } else {
2581                             mask_head = mask_new;
2582                         }
2583                         mask = mask_new;
2584                         mask_sint = SeqIntNew();
2585                         mask_sint->from = total_length;
2586                         mask_sint->to = total_length;
2587                         mask_to = total_length;
2588                         mask_sint->strand = Seq_strand_both;
2589                         mask_sint->id = SeqIdDup(sip);
2590                         mask_new->data.ptrvalue = mask_sint;
2591                     }
2592                 } else {
2593                     this_char_masked = FALSE;
2594                 }
2595             }
2596             byte_from = TO_UPPER (byte_from);
2597             if (is_na && byte_from == 'U') byte_from = 'T';
2598             if (is_na && byte_from == 'X') byte_from = 'N';
2599             if((uch = SeqMapTableConvert(smtp, byte_from)) !=
2600                INVALID_RESIDUE && byte_from != '-') {
2601                 if (last_was_star) {
2602                     total_length++;
2603                     out_buff[out_index] = SeqMapTableConvert(smtp, '*');
2604                     out_index++;
2605                     if(out_index == FTSE_BUFF_CHUNK) {
2606                         if(BSWrite(*bs_out, out_buff, out_index) != out_index) {
2607                             MemFree(badchar);
2608                             MemFree(in_buff);
2609                             MemFree(out_buff);
2610                             return FALSE;
2611                         }
2612                         out_index = 0;
2613                     }
2614                     last_was_star = FALSE;
2615                 }
2616                 total_length++;
2617                 if(is_na) {
2618                     if(!Second) {
2619                         uch <<= 4;
2620                         out_buff[out_index] = uch;
2621                     } else {
2622                         out_buff[out_index] += uch;
2623                         out_index++;
2624                     }
2625                     Second = !Second;
2626                 } else {
2627                     out_buff[out_index] = uch;
2628                     out_index++;
2629                 }
2630             } else if (errormsg != NULL){
2631                 if(IS_ALPHA(byte_from) || byte_from == '-' || byte_from == '?') {
2632                     (badchar [(int) (byte_from)])++;
2633                     badchars++;
2634                 }
2635             }
2636         } else {    /* ch == ';' */
2637             /* We have to ignore rest of the line */
2638             skip_to_eol = TRUE;
2639             while(in_index < total_read  &&
2640                   (byte_from = in_buff[in_index]) != '\n' &&
2641                   byte_from != '\r')
2642                 in_index++;
2643     /* Do not skip other passes if a line-return has
2644     been encountered as shown by examining less than the total
2645     (for FASTA_MEM_IO) or finding a line-return (FASTA_FILE_IO). */
2646             if(in_index < total_read ||
2647         (in_index < FTSE_BUFF_CHUNK &&
2648         (in_buff[in_index] == '\n' || in_buff[in_index] == '\r')))
2649                 skip_to_eol = FALSE;
2650         }
2651         if(out_index == FTSE_BUFF_CHUNK) {
2652             if(BSWrite(*bs_out, out_buff, out_index) != out_index) {
2653                 MemFree (badchar);
2654                 MemFree(in_buff);
2655                 MemFree(out_buff);
2656                 return FALSE;
2657             }
2658             out_index = 0;
2659         }
2660     }  /* while (TRUE) */
2661     /* We have to write remaining stuff in out_buff */
2662     if(is_na && Second) out_index++; /* Partial byte for DNA */
2663     if(BSWrite(*bs_out, out_buff, out_index) != out_index) {
2664         MemFree (badchar);
2665         MemFree(in_buff);
2666         MemFree(out_buff);
2667         return FALSE;
2668     }
2669     *seq_length = total_length;
2670     /* If required bad characters statistics */
2671     if (errormsg != NULL && badchars > 0) {
2672         if((ptr = (CharPtr) MemNew (sizeof(Char)*512)) == NULL)
2673             return FALSE;
2674         chptr = "";
2675         sprintf (ptr, "%ld illegal %s %s removed:\n", (long) badchars,
2676                  badchars == 1 ? "character" : "characters",
2677                  badchars == 1 ? "was" : "were"
2678                  );
2679         for (ch = 'A', i =0; ch <= 'Z'; ch++, i++) {
2680             if ((badchar[ch]) > 0) {
2681                 sprintf (tmp, "%s%d %c%s",
2682                          chptr, (int) badchar[ch], ch,
2683                          badchar[ch] == 1 ? "" : "s");
2684                 StringCat (ptr, tmp);
2685                 chptr = ", ";
2686             }
2687         }
2688         ch = '-';
2689         if ((badchar[ch]) > 0) {
2690             sprintf (tmp, "%s%d %c%s",
2691                      chptr, badchar[ch], ch,
2692                      badchar[ch] == 1 ? "" : "s");
2693             StringCat (ptr, tmp);
2694             chptr = ", ";
2695         }
2696         ch = '?';
2697         if ((badchar[ch]) > 0) {
2698             sprintf (tmp, "%s%d %c%s",
2699                      chptr, badchar[ch], ch,
2700                      badchar[ch] == 1 ? "" : "s");
2701             StringCat (ptr, tmp);
2702             chptr = ", ";
2703         }
2704         *errormsg = StringSave (ptr);
2705         MemFree (ptr);
2706     }
2707     MemFree (badchar);
2708     MemFree(in_buff);
2709     MemFree(out_buff);
2710     if(mask_ptr && mask_head) {
2711         SeqLocPtr slp;
2712         if(mask_sint) {
2713             mask_sint->to = mask_to;
2714             mask->next = NULL;
2715         }
2716         slp = SeqLocNew(NULL);
2717         slp->choice = SEQLOC_PACKED_INT;
2718         slp->data.ptrvalue = mask_head;
2719         *mask_ptr = slp;
2720     }
2721     return TRUE;
2722 }
MakeTrustedID(CharPtr prefix,Int2Ptr ctrptr)2723 static SeqIdPtr MakeTrustedID (CharPtr prefix, Int2Ptr ctrptr)
2724 {
2725   Char buf[128];
2726   ValNodePtr newid;
2727   ObjectIdPtr oid;
2728   Int2 start = 1;
2729     if (ctrptr != NULL) {
2730         start = *ctrptr;
2731     }
2732     if (start < 1) {
2733         start = 1;
2734     }
2735         if (prefix)
2736            sprintf (buf, "%d_%.32s", (int) start, prefix);
2737         else
2738            sprintf(buf, "%d", (int) start);
2739     newid = ValNodeNew (NULL);
2740     oid = ObjectIdNew ();
2741     newid->choice = SEQID_LOCAL;
2742     newid->data.ptrvalue = oid;
2743     oid->str = StringSave (buf);
2744     if (ctrptr != NULL) {
2745         *ctrptr = start + 1;
2746     }
2747     return newid;
2748 }
FastaToSeqEntryInternalExEx(VoidPtr input,Int4 type,CharPtr PNTR next_char,Boolean is_na,CharPtr PNTR errormsg,Boolean parseSeqId,CharPtr special_symbol,CharPtr prefix,Int2Ptr ctrptr,SeqLocPtr PNTR mask_ptr,Boolean trustID)2749 static SeqEntryPtr FastaToSeqEntryInternalExEx
2750 (
2751  VoidPtr input,           /* input pointer (file or memory) */
2752  Int4 type,               /* type of inquiry FASTA_MEM_IO or FASTA_FILE_IO */
2753  CharPtr PNTR next_char,  /* returned pointer to next FASTA sequence */
2754  Boolean is_na,           /* type of sequence */
2755  CharPtr PNTR errormsg,   /* error messge for debugging */
2756  Boolean parseSeqId,      /* Parse SeqID from def line */
2757  CharPtr special_symbol,  /* Returns special symbol if no SeqEntry */
2758  CharPtr prefix,          /* prefix for localID if not parsable */
2759  Int2Ptr ctrptr,          /* starting point for constructing unique ID */
2760  SeqLocPtr PNTR mask_ptr, /* Pointer to a SeqLoc to Fill with Masking information */
2761  Boolean trustID
2762  )
2763 {
2764     SeqEntryPtr    sep = NULL;
2765     BioseqPtr      bsp = NULL;
2766     ValNodePtr     vnp = NULL;
2767     Int2           ch;
2768     CharPtr        chptr = NULL, ptr = NULL;
2769     register Int4  i;
2770     CharPtr        defline, buffer= NULL;   /* Working buffers */
2771     Int4           BuffSize = FTSE_BUFF_CHUNK;
2772     long           len = 0;
2773     FILE           *fd;
2774     const Char     PNTR firstchar;
2775     Boolean        is_gap = FALSE;
2776     if (special_symbol != NULL) {
2777         *special_symbol = '\0';
2778     }
2779     if (input == NULL)     /* empty input */
2780         return NULL;
2781     /* Type of input depends upon calling function */
2782     if(type == FASTA_FILE_IO)
2783         fd = (FILE *) input;
2784     else    /* type == FASTA_MEM_IO */
2785         firstchar = (const Char PNTR) input;
2786     /* Rolling spaces to check first non-space character */
2787     if(type == FASTA_FILE_IO) {
2788         do {
2789             ch = NLM_GETC(fd);
2790             if (ch == '!' || ch == '#') { /* comment symbol - ignore rest of line */
2791                 do {
2792                     ch = NLM_GETC(fd);
2793                 } while (ch != '\n' && ch != '\r' && ch != '\0' && ch != EOF);
2794             }
2795         } while (IS_WHITESP(ch));
2796     } else {   /* if(type == FASTA_MEM_IO*/
2797         while (IS_WHITESP(ch = *firstchar)) /* Rolling spaces */
2798             firstchar++;
2799     }
2800     if(ch == EOF || ch == NULLB || ch == '&' || ch == '{' ||
2801        ch == '}' || ch == '[' || ch == ']') {
2802         /* This is empty FILE or buffer or special symbol detected */
2803         if (special_symbol != NULL) {
2804             *special_symbol = ch;
2805         }
2806         return NULL;
2807     }
2808     /* First character is valid: initializing main structures */
2809     /* Initializing Seq-entry structure */
2810     if((sep = SeqEntryNew()) == NULL) {
2811         MemFree(buffer);
2812         return NULL;
2813     }
2814     sep->choice = BIOSEQ;  /* == 1 */
2815     if((bsp = BioseqNew()) == NULL) {
2816         MemFree(buffer);
2817         return NULL;
2818     }
2819     sep->data.ptrvalue = bsp;
2820     SeqMgrSeqEntry (SM_BIOSEQ, (Pointer)bsp, sep);
2821     if (is_na) {
2822         bsp->mol = Seq_mol_na;
2823         bsp->seq_data_type = Seq_code_ncbi4na;
2824     } else {
2825         bsp->mol = Seq_mol_aa;
2826         bsp->seq_data_type = Seq_code_ncbistdaa;
2827     }
2828     bsp->repr = Seq_repr_raw;
2829     /*  ------------- */
2830     /* Now reading defline into memory */
2831   /* DEFLINE PROCCESSING*/
2832     if(ch == '>') {     /* Defline is present - processing */
2833         if((buffer = (CharPtr) MemNew(BuffSize+1)) == NULL)
2834             return NULL;
2835         if(type == FASTA_FILE_IO) {    /* File */
2836             buffer[0] = (Char) ch;
2837         i = 0;
2838         fgets(buffer+1, BuffSize, fd);
2839         while (1)
2840         {
2841         while (i<BuffSize-1)
2842         {
2843                     if(buffer[i] == '\n' || buffer[i] == '\r' || buffer[i] == NULLB)
2844             {
2845                 buffer[i] = NULLB;
2846                             break;
2847             }
2848             i++;
2849         }
2850         if (i == BuffSize-1 && (buffer[i] == '\n' || buffer[i] == '\r'))
2851         {
2852             buffer[i] = NULLB;
2853         }
2854         if (buffer[i] == NULLB)
2855             break;
2856                 BuffSize = i + FTSE_BUFF_CHUNK;
2857                 if((buffer = (CharPtr)Realloc(buffer, BuffSize+1)) == NULL)
2858         {
2859                           ErrLogPrintf("Error re-allocating memory in FastaToSeqEntry");
2860                         MemFree(buffer);
2861                         return NULL;
2862         }
2863             fgets(buffer+i+1, FTSE_BUFF_CHUNK, fd);
2864         }
2865         } else {  /* type = FASTA_MEM_IO */
2866             for(i =0; (ch = *firstchar) != NULLB; firstchar++, i++) {
2867                 if (i >= BuffSize) {
2868                     BuffSize = i + FTSE_BUFF_CHUNK;
2869                     buffer = (CharPtr) Realloc(buffer, BuffSize);
2870                 }
2871                 if((buffer[i] = (Char) ch) == '\n' || ch == '\r') {
2872                     break;
2873                 }
2874             }
2875             buffer[i] = NULLB;
2876             if(ch == NULLB) {/* the end of buffer */
2877                 *next_char = NULL;
2878                 input =  (VoidPtr) "\0";
2879             } else {
2880                 *next_char = (CharPtr) firstchar;
2881                 input = (VoidPtr) firstchar;
2882             }
2883         }
2884         defline = buffer+1;   /* Character after '>' */
2885         if(defline[0] != '?') {
2886             /* Creating standard Seq-id */
2887             ptr = defline;
2888             while (IS_WHITESP(*ptr))
2889                 ptr++;
2890             if (parseSeqId) {
2891                 if (*ptr == '"') {
2892                     ptr++;
2893                     chptr = StringChr (ptr, '"');
2894                 } else {
2895                    for (chptr = ptr; *chptr != NULLB && !IS_WHITESP(*chptr);
2896                         chptr++) continue;
2897                    if (*chptr == NULLB)
2898                       chptr = NULL;
2899                 }
2900             }
2901             if (!parseSeqId) {
2902                 chptr = ptr;
2903             } else if (chptr != NULL) {
2904                 *chptr = NULLB;
2905                 chptr++;
2906                 bsp->id = MakeSeqID (ptr);
2907             } else if (*ptr != NULLB) {
2908                 bsp->id = MakeSeqID (ptr);
2909             }
2910             if (bsp->id == NULL) {
2911                 if (trustID) {
2912                   bsp->id = MakeTrustedID (prefix, ctrptr);
2913                 } else {
2914                   bsp->id = MakeNewProteinSeqIdExMT (NULL, NULL, prefix, ctrptr, TRUE);
2915                 }
2916             }
2917             if (chptr != NULL) {
2918                 if((vnp = SeqDescrNew(NULL)) != NULL) {
2919                     vnp->choice = Seq_descr_title;
2920                     while (IS_WHITESP(*chptr))
2921                         chptr++;
2922                     vnp->data.ptrvalue = StringSave (chptr);
2923                 }
2924                 bsp->descr = vnp;
2925             }
2926         } else {
2927             /* Unknown Seq-id */
2928             ptr = defline + 1;
2929             while (IS_WHITESP(*ptr))
2930                 ptr++;
2931             if (StringNCmp (ptr, "unk100", 6) == 0) {
2932               bsp->id = MakeSeqID ("lcl|unk100");
2933               ptr += 3;
2934             } else {
2935               bsp->id = MakeSeqID ("lcl|gap");
2936             }
2937             bsp->repr = Seq_repr_virtual;
2938             if(*ptr != '\0' && sscanf(ptr, "%ld", &len) == 1 && len > 0) {
2939                 bsp->length =  (Int4) len;
2940             } else {
2941                 bsp->length = -1;
2942             }
2943             is_gap = TRUE;
2944         }
2945         MemFree(buffer);
2946     } else {  /* if ch == '>' EMPTY DEFLINE */
2947         /* Defline is upsent - creating default defline */
2948         if (trustID) {
2949           bsp->id = MakeTrustedID (prefix, ctrptr);
2950         } else {
2951           bsp->id = MakeNewProteinSeqIdExMT (NULL, NULL, prefix, ctrptr, TRUE);
2952         }
2953         if(type == FASTA_FILE_IO)
2954             ungetc(ch, fd);
2955     }
2956     SeqMgrAddToBioseqIndex (bsp);
2957     /* OK, now processing sequence */
2958     if (! is_gap) {
2959         if(!FastaReadSequenceInternalEx(input, type, next_char, is_na,
2960                                         &bsp->length,
2961                                         (ByteStorePtr PNTR) &(bsp->seq_data),
2962                                         errormsg, special_symbol,
2963                                         mask_ptr, bsp->id)) {
2964             ErrPostEx(SEV_FATAL, 0, 0, "Failure to read sequence. "
2965                       "FastaToSeqEntry() failed.\n");
2966             return NULL;
2967         }
2968     }
2969     BioseqPack(bsp);     /* Trying to pack Bioseq more */
2970     return sep;
2971 }
FastaToSeqEntryInternalEx(VoidPtr input,Int4 type,CharPtr PNTR next_char,Boolean is_na,CharPtr PNTR errormsg,Boolean parseSeqId,CharPtr special_symbol,CharPtr prefix,Int2Ptr ctrptr,SeqLocPtr PNTR mask_ptr)2972 NLM_EXTERN SeqEntryPtr FastaToSeqEntryInternalEx
2973 (
2974  VoidPtr input,          /* input pointer (file or memory) */
2975  Int4 type,              /* type of inquiry FASTA_MEM_IO or FASTA_FILE_IO */
2976  CharPtr PNTR next_char, /* returned pointer to next FASTA sequence */
2977  Boolean is_na,          /* type of sequence */
2978  CharPtr PNTR errormsg,  /* error messge for debugging */
2979  Boolean parseSeqId,     /* Parse SeqID from def line */
2980  CharPtr special_symbol, /* Returns special symbol if no SeqEntry */
2981  CharPtr prefix,         /* prefix for localID if not parsable */
2982  Int2Ptr ctrptr,         /* starting point for constructing unique ID */
2983  SeqLocPtr PNTR mask_ptr /* Pointer to a SeqLoc to Fill with Masking information */
2984  )
2985 {
2986   return FastaToSeqEntryInternalExEx (input, type, next_char, is_na, errormsg,
2987                                       parseSeqId, special_symbol, prefix, ctrptr,
2988                                       mask_ptr, FALSE);
2989 }
FastaToSeqEntryInternal(VoidPtr input,Int4 type,CharPtr PNTR next_char,Boolean is_na,CharPtr PNTR errormsg,Boolean parseSeqId,CharPtr special_symbol)2990 NLM_EXTERN SeqEntryPtr FastaToSeqEntryInternal
2991 (
2992  VoidPtr input,          /* input pointer (file or memory) */
2993  Int4 type,              /* type of inquiry FASTA_MEM_IO or FASTA_FILE_IO */
2994  CharPtr PNTR next_char, /* returned pointer to next FASTA sequence */
2995  Boolean is_na,          /* type of sequence */
2996  CharPtr PNTR errormsg,  /* error messge for debugging */
2997  Boolean parseSeqId,     /* Parse SeqID from def line */
2998  CharPtr special_symbol  /* Returns special symbol if no SeqEntry */
2999  )
3000 {
3001     return FastaToSeqEntryInternalEx (input, type, next_char, is_na, errormsg,
3002                                           parseSeqId, special_symbol, NULL, NULL,NULL);
3003 }
3004 /*****************************************************************************
3005 *
3006 *   FastaId(bsp, buf, buflen)
3007 *      Makes the string for the id part of fasta format.
3008 *      buf should be at least 40 bytes
3009 *
3010 *****************************************************************************/
FastaId(BioseqPtr bsp,CharPtr buf,Uint4 buflen)3011 NLM_EXTERN Boolean FastaId(BioseqPtr bsp, CharPtr buf, Uint4 buflen)
3012 {
3013     if ((bsp == NULL) || (buf == NULL)) return FALSE;
3014     SeqIdWrite(bsp->id, buf, PRINTID_FASTA_LONG, buflen);
3015     return TRUE;
3016 }
3017 
/*****************************************************************************
*
*   FastaIdX(bsp, buf, buflen, printid_general, seqloc)
*      Writes the id part of a FASTA defline into buf (capacity buflen).
*
*      When seqloc is NULL or covers the whole Bioseq, the id is written
*      with PRINTID_FASTA_GENERAL (printid_general TRUE) or
*      PRINTID_FASTA_LONG. Otherwise the id is written with
*      PRINTID_FASTA_SHORT and followed by ":<start>-<stop>", using the
*      1-based ends of seqloc; the suffix is truncated if buf is too small.
*
*      Returns FALSE when bsp or buf is NULL, TRUE otherwise.
*
*****************************************************************************/
static Boolean FastaIdX(BioseqPtr bsp, CharPtr buf, Uint4 buflen, Boolean printid_general, SeqLocPtr seqloc)
{
    Char    range[64];  /* ":%ld-%ld" needs at most 43 bytes with 64-bit longs */
    CharPtr src, dst;
    Uint4   length, room;

    if ((bsp == NULL) || (buf == NULL)) return FALSE;

    if (seqloc == NULL || SeqLocLen(seqloc) == bsp->length)
    { /* Full sequence is being dumped. */
        if (printid_general) {
            SeqIdWrite(bsp->id, buf, PRINTID_FASTA_GENERAL, buflen);
        } else {
            SeqIdWrite(bsp->id, buf, PRINTID_FASTA_LONG, buflen);
        }
    }
    else
    {
        SeqIdWrite(bsp->id, buf, PRINTID_FASTA_SHORT, buflen);
        length = (Uint4) StringLen(buf);

        /* Format the range separately, then append only what fits:
           writing straight to buf+length could run past buflen. */
        sprintf(range, ":%ld-%ld", (long) (SeqLocStart(seqloc)+1), (long) (SeqLocStop(seqloc)+1));
        if (length < buflen) {
            room = buflen - length - 1;   /* characters left, not counting the NUL */
            dst = buf + length;
            for (src = range; *src != '\0' && room > 0; src++, dst++, room--) {
                *dst = *src;
            }
            *dst = '\0';
        }
    }
    return TRUE;
}
3038 
FastaGetOriginalId(BioseqPtr bsp)3039 NLM_EXTERN CharPtr FastaGetOriginalId (BioseqPtr bsp)
3040 
3041 {
3042     CharPtr        id;
3043     ObjectIdPtr    oip;
3044     SeqDescrPtr    sdp;
3045     UserFieldPtr   ufp;
3046     UserObjectPtr  uop;
3047 
3048   if (bsp == NULL) return NULL;
3049 
3050   for (sdp = bsp->descr; sdp != NULL; sdp = sdp->next) {
3051     if (sdp->choice != Seq_descr_user) continue;
3052     uop = (UserObjectPtr) sdp->data.ptrvalue;
3053     if (uop == NULL) continue;
3054     oip = uop->type;
3055     if (oip == NULL) continue;
3056     if (StringCmp (oip->str, "OrginalID") != 0 && StringCmp (oip->str, "OriginalID") != 0) continue;
3057     for (ufp = uop->data; ufp != NULL; ufp = ufp->next) {
3058       oip = ufp->label;
3059       if (oip == NULL) continue;
3060       if (StringCmp (oip->str, "LocalId") != 0) continue;
3061       if (ufp->choice != 1) continue;
3062       id = (CharPtr) ufp->data.ptrvalue;
3063       if (id == NULL) continue;
3064       return id;
3065     }
3066   }
3067 
3068   return NULL;
3069 }
3070 
ShouldUseOriginalID(BioseqPtr bsp)3071 NLM_EXTERN Boolean ShouldUseOriginalID (BioseqPtr bsp)
3072 
3073 {
3074   DbtagPtr  dbt;
3075   SeqIdPtr  sip;
3076 
3077   if (bsp == NULL) return FALSE;
3078 
3079   for (sip = bsp->id; sip != NULL; sip = sip->next) {
3080     switch (sip->choice) {
3081       case SEQID_LOCAL :
3082         break;
3083       case SEQID_GENERAL :
3084         dbt = (DbtagPtr) sip->data.ptrvalue;
3085         if (dbt != NULL) {
3086           if (! IsSkippableDbtag (dbt)) return FALSE;
3087         }
3088         break;
3089       default :
3090         return FALSE;
3091     }
3092   }
3093 
3094   return TRUE;
3095 }
3096 
FastaIdEx(BioseqPtr bsp,CharPtr buf,Uint4 buflen,Boolean prefer_original_ID)3097 NLM_EXTERN Boolean FastaIdEx(BioseqPtr bsp, CharPtr buf, Uint4 buflen, Boolean prefer_original_ID)
3098 {
3099     CharPtr   id;
3100     SeqIdPtr  sip;
3101 
3102     if ((bsp == NULL) || (buf == NULL)) return FALSE;
3103     if (prefer_original_ID) {
3104       sip = bsp->id;
3105       if (ShouldUseOriginalID (bsp)) {
3106         id = FastaGetOriginalId (bsp);
3107         if (id != NULL && StringLen (id) + 5 < buflen) {
3108           sprintf (buf, "lcl|%s", id);
3109           return TRUE;
3110         }
3111       }
3112     }
3113     SeqIdWrite(bsp->id, buf, PRINTID_FASTA_LONG, buflen);
3114     return TRUE;
3115 }
3116 
3117 /*****************************************************************************
3118 *
3119 *   FastaDefLine(bsp, buf, buflen, accession, organism)
3120 *       Finds or makes a FASTA format defline (just locates the string)
3121 *       buf should be very long if possible
3122 *       function truncates if buf not long enough
3123 *       a few deflines are longer than 255
3124 *
3125 *****************************************************************************/
FastaDefLine(BioseqPtr bsp,CharPtr buf,Uint4 buflen,CharPtr accession,CharPtr organism,Uint1 tech)3126 NLM_EXTERN Boolean FastaDefLine (BioseqPtr bsp, CharPtr buf, Uint4 buflen,
3127                                           CharPtr accession, CharPtr organism, Uint1 tech)
3128 {
3129     BioseqContextPtr bcp;
3130     ValNodePtr vnp;
3131     CharPtr tmp;
3132     PdbBlockPtr pbp;
3133     PatentSeqIdPtr psip;
3134     Uint4 diff, phase;
3135     Int4 num_segs, num_gaps;
3136     Char tbuf[128];
3137     static CharPtr htgs[2] = {
3138         "unordered", "ordered" };
3139     if ((bsp == NULL) || (buf == NULL)) return FALSE;
3140     buflen--;
3141     buf[buflen] = '\0';
3142     if (accession != NULL)
3143     {
3144         diff = LabelCopyExtra(buf, accession, buflen, "(", ") ");
3145         buflen -= diff;
3146         buf += diff;
3147     }
3148     bcp = BioseqContextNew(bsp);
3149     diff = 0;
3150     if ((tmp = BioseqContextGetTitle(bcp)) != NULL) {
3151         diff = LabelCopy(buf, tmp, buflen);
3152                                 /* remove trailing blanks and periods */
3153         tmp = buf + diff - 1;   /* point at last character */
3154         while (((*tmp <= ' ') || (*tmp == '.')) && (diff))
3155         {
3156             *tmp = '\0';
3157             tmp--; diff--;
3158         }
3159     }
3160     else
3161         if ((vnp = BioseqContextGetSeqDescr(bcp, Seq_descr_pdb, NULL, NULL)) != NULL)
3162     {
3163         pbp = (PdbBlockPtr)(vnp->data.ptrvalue);
3164         diff = LabelCopy(buf, (CharPtr)(pbp->compound->data.ptrvalue), buflen);
3165     }
3166     else
3167     {
3168         for (vnp = bsp->id; vnp != NULL; vnp = vnp->next)
3169         {
3170             if (vnp->choice == SEQID_PATENT)
3171             {
3172                 psip = (PatentSeqIdPtr)(vnp->data.ptrvalue);
3173                 sprintf(tbuf, "Sequence %d from Patent %s %s",
3174                     (int)psip->seqid, psip->cit->country, psip->cit->number);
3175                 diff = LabelCopy(buf, tbuf, buflen);
3176                 break;
3177             }
3178         }
3179         if (vnp == NULL)
3180             diff = LabelCopy(buf, "No definition line found", buflen);
3181     }
3182     buflen -= diff;
3183     buf += diff;
3184     BioseqContextFree(bcp);
3185     if (((tech >= MI_TECH_htgs_1) && (tech <= MI_TECH_htgs_3)) ||
3186         (tech == MI_TECH_htgs_0))
3187     {
3188       if (tech == MI_TECH_htgs_0) {
3189         phase = 0;
3190         StringMove(tbuf, ", LOW-PASS SEQUENCE SAMPLING.");
3191       }
3192       else {
3193         phase = (Int2)(tech - MI_TECH_htgs_1 + 1);
3194         if (phase != 3)
3195           StringMove(tbuf, ", WORKING DRAFT SEQUENCE");
3196       }
3197       if (phase != 3) {
3198         diff = LabelCopy(buf, tbuf, buflen);
3199         buflen -= diff;
3200         buf += diff;
3201       }
3202         if (phase == 3)
3203         {
3204             if (tmp && StringStr(tmp, "complete sequence") == NULL) {
3205                 diff = LabelCopy(buf, ", complete sequence", buflen);
3206                 buflen -= diff;
3207                 buf += diff;
3208             }
3209         }
3210         else if ((bsp->repr == Seq_repr_delta) && (phase != 0))
3211         {
3212             if (CountGapsInDeltaSeq(bsp, &num_segs, &num_gaps, NULL, NULL, NULL, 0))
3213             {
3214                 if (num_gaps > 0) {
3215                     sprintf(tbuf, ", %ld %s pieces", (long)(num_gaps + 1), htgs[phase - 1]);
3216                 } else {
3217                     sprintf(tbuf, ", %ld %s piece", (long)(num_gaps + 1), htgs[phase - 1]);
3218                 }
3219                 diff = LabelCopy(buf, tbuf, buflen);
3220                 buflen -= diff;
3221                 buf += diff;
3222             }
3223         }
3224     }
3225     if (organism != NULL)
3226     {
3227         LabelCopyExtra(buf, organism, buflen, " [", "]");
3228     }
3229     return TRUE;
3230 }
is_pdb(BioseqPtr bsp)3231 static Boolean is_pdb(BioseqPtr bsp)
3232 {
3233     SeqIdPtr id;
3234     if (bsp ==NULL)
3235         return FALSE;
3236     for (id = bsp->id; id; id=id->next)
3237     {
3238         if (id->choice == SEQID_PDB)
3239             return TRUE;
3240     }
3241     return FALSE;
3242 }
/* Appends the chain 'next' to the end of the list 'head' and returns the
   list's first node. With an empty list (head == NULL) the result is
   simply 'next'. */
static ValNodePtr tie_next(ValNodePtr head, ValNodePtr next)
{
    ValNodePtr tail;

    if (head == NULL) {
        return next;
    }

    /* walk to the last node */
    tail = head;
    while (tail->next != NULL) {
        tail = tail->next;
    }
    tail->next = next;
    return head;
}
get_descr_on_top(GatherContextPtr gcp)3254 static Boolean get_descr_on_top (GatherContextPtr gcp)
3255 {
3256     ValNodePtr    tmp;
3257     DescrInfoPtr    PNTR dspp;
3258     DescrInfoPtr    dsp;
3259     ItemInfoPtr     iip;
3260     dspp = (DescrInfoPtr PNTR) gcp->userdata;
3261     dsp = *dspp;
3262     switch (gcp->thistype) {
3263     case OBJ_SEQDESC:
3264         tmp = (ValNodePtr) (gcp->thisitem);
3265         if (tmp->choice == dsp->choice) {
3266             if (tmp->data.ptrvalue != NULL) {
3267                 dsp->vnp = tmp;
3268                 iip = (ItemInfoPtr) MemNew(sizeof(ItemInfo));
3269                 if(dsp->iip != NULL)
3270                     MemFree(dsp->iip);
3271                 dsp->iip = iip;
3272                 iip->entityID = gcp->entityID;
3273                 iip->itemID = gcp->itemID;
3274                 iip->itemtype = gcp->thistype;
3275             }
3276         }
3277         break;
3278     default:
3279         break;
3280     }
3281     return TRUE;
3282 }
get_descr(GatherContextPtr gcp)3283 static Boolean get_descr (GatherContextPtr gcp)
3284 {
3285     ValNodePtr    tmp;
3286     DescrInfoPtr    PNTR dspp;
3287     DescrInfoPtr    dsp;
3288     ItemInfoPtr     iip;
3289     BioseqPtr         bsp;
3290     dspp = (DescrInfoPtr PNTR) gcp->userdata;
3291     dsp = *dspp;
3292     switch (gcp->thistype)
3293     {
3294         case OBJ_SEQDESC:
3295             tmp = (ValNodePtr) (gcp->thisitem);
3296             if (tmp->choice == dsp->choice) {
3297                 bsp = (BioseqPtr) (gcp->parentitem);
3298                 if (dsp->bsp != bsp) {
3299                     break;
3300                 }
3301                 if (tmp->data.ptrvalue != NULL) {
3302                     dsp->vnp = tmp;
3303                     iip = (ItemInfoPtr) MemNew(sizeof(ItemInfo));
3304                     dsp->iip = iip;
3305                     iip->entityID = gcp->entityID;
3306                     iip->itemID = gcp->itemID;
3307                     iip->itemtype = gcp->thistype;
3308                 }
3309             }
3310             break;
3311         default:
3312             break;
3313     }
3314     return TRUE;
3315 }
GetFeatProt(GatherContextPtr gcp)3316 static Boolean GetFeatProt (GatherContextPtr gcp)
3317 {
3318     ValNodePtr    PNTR vnpp;
3319     ValNodePtr tmp;
3320     SeqFeatPtr    sfp;
3321     vnpp = (ValNodePtr PNTR) gcp->userdata;
3322     switch (gcp->thistype)
3323     {
3324         case OBJ_SEQFEAT:
3325             sfp = (SeqFeatPtr) (gcp->thisitem);
3326             if (sfp->data.choice == SEQFEAT_PROT) {
3327                 tmp = ValNodeNew(NULL);
3328                 tmp->data.ptrvalue = sfp;
3329                 *vnpp = tie_next(*vnpp, tmp);
3330             }
3331             break;
3332         default:
3333             break;
3334     }
3335     return TRUE;
3336 }
GetFeatCDS(GatherContextPtr gcp)3337 static Boolean GetFeatCDS (GatherContextPtr gcp)
3338 {
3339     SeqFeatPtr    PNTR sfpp;
3340     SeqFeatPtr    sfp;
3341     sfpp = (SeqFeatPtr PNTR) gcp->userdata;
3342     switch (gcp->thistype)
3343     {
3344         case OBJ_SEQFEAT:
3345             sfp = (SeqFeatPtr) (gcp->thisitem);
3346             if (sfp->data.choice == SEQFEAT_CDREGION) {
3347                 *sfpp = sfp;
3348                 return FALSE;
3349             }
3350             break;
3351         default:
3352             break;
3353     }
3354     *sfpp = NULL;
3355     return TRUE;
3356 }
GetFeatGenes(GatherContextPtr gcp)3357 static Boolean GetFeatGenes (GatherContextPtr gcp)
3358 {
3359     ValNodePtr    PNTR vnpp;
3360     ValNodePtr tmp;
3361     SeqFeatPtr    sfp;
3362     vnpp = (ValNodePtr PNTR) gcp->userdata;
3363     switch (gcp->thistype)
3364     {
3365         case OBJ_SEQFEAT:
3366             sfp = (SeqFeatPtr) (gcp->thisitem);
3367             if (sfp->data.choice == SEQFEAT_GENE) {
3368                 tmp = ValNodeNew(NULL);
3369                 tmp->data.ptrvalue = sfp;
3370                 *vnpp = tie_next(*vnpp, tmp);
3371             }
3372             break;
3373         default:
3374             break;
3375     }
3376     return TRUE;
3377 }
IndexedGatherDescrOnBioseq(ItemInfoPtr iip,BioseqPtr bsp,Uint1 choice)3378 static ValNodePtr IndexedGatherDescrOnBioseq (ItemInfoPtr iip, BioseqPtr bsp, Uint1 choice)
3379 {
3380     SeqMgrDescContext  dcontext;
3381     SeqDescrPtr        sdp;
3382     sdp = SeqMgrGetNextDescriptor (bsp, NULL, choice, &dcontext);
3383     if (sdp == NULL) return NULL;
3384     if (ISA_aa(bsp->mol) && !is_pdb(bsp)) {
3385         if (dcontext.level != 0) return NULL;
3386     }
3387     if (iip != NULL) {
3388         iip->entityID = dcontext.entityID;
3389         iip->itemID = dcontext.itemID;
3390         iip->itemtype = OBJ_SEQDESC;
3391     }
3392     return sdp;
3393 }
GatherDescrOnBioseq(ItemInfoPtr iip,BioseqPtr bsp,Uint1 choice,Boolean get_first)3394 static ValNodePtr GatherDescrOnBioseq(ItemInfoPtr iip, BioseqPtr bsp, Uint1 choice, Boolean get_first)
3395 {
3396     ValNodePtr    vnp = NULL;
3397     /*
3398     GatherScope   gsc;
3399     SeqLocPtr     slp;
3400     Uint2         bspID;
3401     DescrInfoPtr  dsp;
3402     Uint2         entityID;
3403     */
3404     ObjValNodePtr ovp;
3405   if (ISA_aa(bsp->mol) && !is_pdb(bsp)) {
3406     vnp = BioseqGetSeqDescr (bsp, choice, NULL);
3407   } else {
3408     vnp = GetNextDescriptorUnindexed (bsp, choice, NULL);
3409   }
3410   if (vnp != NULL) {
3411     if (iip != NULL) {
3412       if (vnp->extended != 0) {
3413         ovp = (ObjValNodePtr) vnp;
3414         iip->entityID = ovp->idx.entityID;
3415         iip->itemtype = ovp->idx.itemtype;
3416         iip->itemID = ovp->idx.itemID;
3417       }
3418     }
3419   }
3420   return vnp;
3421 #if 0
3422     entityID = ObjMgrGetEntityIDForPointer (bsp);
3423     if (SeqMgrFeaturesAreIndexed (entityID)) {
3424         return IndexedGatherDescrOnBioseq (iip, bsp, choice);
3425     }
3426     /*
3427     if (iip==NULL && (get_first || (ISA_aa(bsp->mol) && !is_pdb(bsp))) ) {
3428         for(vnp=bsp->descr;vnp && vnp->choice != choice; vnp=vnp->next){}
3429         return vnp;
3430     }
3431     */
3432     if (iip==NULL && get_first)
3433           {
3434             for(vnp=bsp->descr;vnp; vnp=vnp->next)
3435               if(vnp->choice == choice)
3436                 return vnp;
3437           }
3438     dsp = (DescrInfoPtr) MemNew(sizeof(DescrInfo));
3439     dsp->choice = choice;
3440     dsp->bsp = bsp;
3441       MemSet ((Pointer) (&gsc), 0, sizeof (GatherScope));
3442     MemSet ((Pointer) (gsc.ignore), (int)(TRUE), (size_t) (OBJ_MAX * sizeof(Boolean)));
3443     gsc.ignore[OBJ_SEQDESC] = FALSE;
3444     bspID = ObjMgrGetEntityIDForPointer(bsp);
3445     slp = ValNodeNew(NULL);
3446     slp->choice = SEQLOC_WHOLE;
3447     slp->data.ptrvalue = (SeqIdPtr) SeqIdDup (SeqIdFindBest (bsp->id, 0));
3448     gsc.target = slp;
3449     if (ISA_aa(bsp->mol) && !is_pdb(bsp)) {
3450         GatherEntity(bspID, &dsp, get_descr, &gsc);
3451     } else {
3452         GatherEntity(bspID, &dsp, get_descr_on_top, &gsc);
3453     }
3454     SeqLocFree(slp);
3455     vnp = dsp->vnp;
3456     if (vnp && vnp->data.ptrvalue) {
3457         if (iip != NULL) {
3458             iip->entityID = dsp->iip->entityID;
3459             iip->itemID = dsp->iip->itemID;
3460             iip->itemtype = dsp->iip->itemtype;
3461         }
3462         MemFree(dsp->iip);
3463         MemFree(dsp);
3464         return vnp;
3465     }
3466     MemFree(dsp->iip);
3467     MemFree(dsp);
3468     return NULL;
3469 #endif
3470 }
3471 /* more efficient versions of feature gather functions for protein defline */
3472 typedef struct unidxfeatdata {
3473   SeqIdPtr    bspid;
3474   SeqLocPtr   loc;
3475   Int4        longest;
3476   Int4        shortest;
3477   SeqFeatPtr  sfp;
3478 } UndxFeatData, PNTR UndxFeatPtr;
/*
 * GetLongestProtFeat
 *
 * Feature-visit callback.  userdata is a UndxFeatPtr.  A protein feature
 * whose location Seq-id is among userdata->bspid replaces the recorded
 * feature when its SeqLocLen is strictly greater than the longest seen
 * so far.  Features whose length is reported as -1 are ignored.
 */
static void GetLongestProtFeat (
  SeqFeatPtr sfp,
  Pointer userdata
)
{
  UndxFeatPtr  best;
  SeqIdPtr     where;
  Int4         span;

  if (sfp == NULL) return;
  if (sfp->data.choice != SEQFEAT_PROT) return;

  best = (UndxFeatPtr) userdata;
  if (best == NULL) return;

  where = SeqLocId (sfp->location);
  if (where == NULL) return;
  if (! SeqIdIn (where, best->bspid)) return;

  span = SeqLocLen (sfp->location);
  if (span == -1) return;
  if (span <= best->longest) return;

  best->longest = span;
  best->sfp = sfp;
}
GetLongestProteinUnindexed(BioseqPtr bsp)3500 static SeqFeatPtr GetLongestProteinUnindexed (
3501   BioseqPtr bsp
3502 )
3503 {
3504   BioseqSetPtr  bssp = NULL;
3505   UndxFeatData  ufd;
3506   if (bsp == NULL) return NULL;
3507   MemSet ((Pointer) &ufd, 0, sizeof (UndxFeatData));
3508   ufd.bspid = bsp->id;
3509   ufd.longest = 0;
3510   ufd.sfp = NULL;
3511   VisitFeaturesOnBsp (bsp, (Pointer) &ufd, GetLongestProtFeat);
3512   if (bsp->idx.parenttype == OBJ_BIOSEQSET) {
3513     bssp = (BioseqSetPtr) bsp->idx.parentptr;
3514   }
3515   if (bssp != NULL && bssp->_class == BioseqseqSet_class_parts) {
3516     VisitFeaturesOnSet (bssp, (Pointer) &ufd, GetLongestProtFeat);
3517     if (bssp->idx.parenttype == OBJ_BIOSEQSET) {
3518       bssp = (BioseqSetPtr) bssp->idx.parentptr;
3519     }
3520   }
3521   if (bssp != NULL && bssp->_class == BioseqseqSet_class_segset) {
3522     VisitFeaturesOnSet (bssp, (Pointer) &ufd, GetLongestProtFeat);
3523   }
3524   return ufd.sfp;
3525 }
/*
 * GetCDSProtFeat
 *
 * Feature-visit callback.  userdata is a UndxFeatPtr.  Records any
 * coding-region feature whose product Seq-id is among userdata->bspid.
 * Each match overwrites the previous one, so the last visited match
 * is the one left in userdata->sfp.
 */
static void GetCDSProtFeat (
  SeqFeatPtr sfp,
  Pointer userdata
)
{
  UndxFeatPtr  hit;
  SeqIdPtr     product_id;

  if (sfp == NULL) return;
  if (sfp->data.choice != SEQFEAT_CDREGION) return;

  hit = (UndxFeatPtr) userdata;
  if (hit == NULL) return;

  product_id = SeqLocId (sfp->product);
  if (product_id != NULL && SeqIdIn (product_id, hit->bspid)) {
    hit->sfp = sfp;
  }
}
GetCDSProtUnindexed(BioseqPtr bsp)3541 static SeqFeatPtr GetCDSProtUnindexed (
3542   BioseqPtr bsp
3543 )
3544 {
3545   Uint2         entityID;
3546   SeqEntryPtr   sep;
3547   UndxFeatData  ufd;
3548   if (bsp == NULL) return NULL;
3549   entityID = ObjMgrGetEntityIDForPointer (bsp);
3550   sep = GetTopSeqEntryForEntityID (entityID);
3551   if (sep == NULL) return NULL;
3552   MemSet ((Pointer) &ufd, 0, sizeof (UndxFeatData));
3553   ufd.bspid = bsp->id;
3554   ufd.sfp = NULL;
3555   VisitFeaturesInSep (sep, (Pointer) &ufd, GetCDSProtFeat);
3556   return ufd.sfp;
3557 }
/*
 * GetBestGeneFeat
 *
 * Feature-visit callback.  userdata is a UndxFeatPtr.  For a gene
 * feature located on one of userdata->bspid, SeqLocAinB (userdata->loc,
 * gene location) is computed; a non-negative result strictly smaller
 * than userdata->shortest makes this gene the new best candidate.
 */
static void GetBestGeneFeat (
  SeqFeatPtr sfp,
  Pointer userdata
)
{
  UndxFeatPtr  best;
  SeqIdPtr     gene_id;
  Int4         score;

  if (sfp == NULL) return;
  if (sfp->data.choice != SEQFEAT_GENE) return;

  best = (UndxFeatPtr) userdata;
  if (best == NULL) return;

  gene_id = SeqLocId (sfp->location);
  if (gene_id == NULL) return;
  if (! SeqIdIn (gene_id, best->bspid)) return;

  score = SeqLocAinB (best->loc, sfp->location);
  if (score < 0 || score >= best->shortest) return;

  best->shortest = score;
  best->sfp = sfp;
}
GetBestGeneUnindexed(SeqLocPtr slp,Uint2 entityID)3580 static SeqFeatPtr GetBestGeneUnindexed (
3581   SeqLocPtr slp,
3582   Uint2 entityID
3583 )
3584 {
3585   BioseqPtr     bsp;
3586   SeqEntryPtr   sep;
3587   SeqIdPtr      sip;
3588   UndxFeatData  ufd;
3589   if (slp == NULL) return NULL;
3590   sip = SeqLocId (slp);
3591   if (sip == NULL) return NULL;
3592   bsp = BioseqFindCore (sip);
3593   if (bsp == NULL) return NULL;
3594   sep = GetTopSeqEntryForEntityID (entityID);
3595   if (sep == NULL) return NULL;
3596   MemSet ((Pointer) &ufd, 0, sizeof (UndxFeatData));
3597   ufd.bspid = bsp->id;
3598   ufd.loc = slp;
3599   ufd.shortest = INT4_MAX;
3600   ufd.sfp = NULL;
3601   VisitFeaturesInSep (sep, (Pointer) &ufd, GetBestGeneFeat);
3602   return ufd.sfp;
3603 }
3604 /* GatherProtCDS is still faster than GetCDSProtUnindexed for some reason */
GatherProtCDS(BioseqPtr bsp)3605 static SeqFeatPtr GatherProtCDS(BioseqPtr bsp)
3606 {
3607     GatherScope gsc;
3608     SeqLocPtr slp = NULL;
3609     Uint2 bspID;
3610     SeqFeatPtr sfp;
3611       MemSet ((Pointer) (&gsc), 0, sizeof (GatherScope));
3612     MemSet ((Pointer) (gsc.ignore), (int)(TRUE), (size_t) (OBJ_MAX * sizeof(Boolean)));
3613     gsc.ignore[OBJ_SEQFEAT] = FALSE;
3614     gsc.ignore[OBJ_SEQANNOT] = FALSE;
3615     gsc.get_feats_product = TRUE;
3616     bspID = ObjMgrGetEntityIDForPointer(bsp);
3617     slp = ValNodeNew(NULL);
3618     slp->choice = SEQLOC_WHOLE;
3619     slp->data.ptrvalue = (SeqIdPtr) SeqIdDup (SeqIdFindBest (bsp->id, 0));
3620     gsc.target = slp;
3621     sfp = NULL;
3622     GatherEntity(bspID, &sfp, GetFeatCDS, &gsc);
3623     SeqLocFree(slp);
3624     return sfp;
3625 }
3626 /* obsolete functions, replaced by Unindexed versions */
GatherSeqFeatProt(BioseqPtr bsp)3627 static SeqFeatPtr GatherSeqFeatProt(BioseqPtr bsp)
3628 {
3629     GatherScope gsc;
3630     SeqLocPtr slp = NULL;
3631     Uint2 bspID;
3632     SeqFeatPtr sfp = NULL;
3633     SeqFeatPtr f;
3634     ValNodePtr prot, v;
3635     Int4 length, longest_length=0;
3636       MemSet ((Pointer) (&gsc), 0, sizeof (GatherScope));
3637     MemSet ((Pointer) (gsc.ignore), (int)(TRUE), (size_t) (OBJ_MAX * sizeof(Boolean)));
3638     gsc.ignore[OBJ_SEQFEAT] = FALSE;
3639     gsc.ignore[OBJ_SEQANNOT] = FALSE;
3640     gsc.get_feats_location = TRUE;
3641     bspID = ObjMgrGetEntityIDForPointer(bsp);
3642     slp = ValNodeNew(NULL);
3643     slp->choice = SEQLOC_WHOLE;
3644     slp->data.ptrvalue = (SeqIdPtr) SeqIdDup (SeqIdFindBest (bsp->id, 0));
3645     gsc.target = slp;
3646     prot = NULL;
3647     GatherEntity(bspID, &prot, GetFeatProt, &gsc);
3648     for (v=prot; v; v=v->next) {
3649         f = (SeqFeatPtr) v->data.ptrvalue;
3650         if ((length=SeqLocLen(f->location)) == -1)
3651             continue;
3652         if (length > longest_length) {
3653             sfp = f;
3654             longest_length = length;
3655         }
3656     }
3657     ValNodeFree(prot);
3658     SeqLocFree(slp);
3659     return sfp;
3660 }
GatherGenesForCDS(SeqLocPtr slp)3661 static ValNodePtr GatherGenesForCDS(SeqLocPtr slp)
3662 {
3663     GatherScope gsc;
3664     Uint2 bspID;
3665     ValNodePtr vnp;
3666     BioseqPtr bsp;
3667     bsp = BioseqFindCore(SeqLocId(slp));
3668     if (bsp == NULL)
3669         return NULL;
3670     bspID = ObjMgrGetEntityIDForPointer(bsp);
3671       MemSet ((Pointer) (&gsc), 0, sizeof (GatherScope));
3672     MemSet ((Pointer) (gsc.ignore), (int)(TRUE), (size_t) (OBJ_MAX * sizeof(Boolean)));
3673     gsc.ignore[OBJ_SEQFEAT] = FALSE;
3674     gsc.ignore[OBJ_SEQANNOT] = FALSE;
3675     gsc.get_feats_location = TRUE;
3676     gsc.target = slp;
3677     vnp = NULL;
3678     GatherEntity(bspID, &vnp, GetFeatGenes, &gsc);
3679     return vnp;
3680 }
3681 typedef struct nmdef {
3682   SeqFeatPtr  gene;
3683   SeqFeatPtr  cds;
3684   SeqFeatPtr  prot;
3685   Int4        protlen;
3686   Int2        numgenes;
3687   Int2        numcds;
3688   Int2        numprots;
3689 } NMDef, PNTR NMDefPtr;
/*
 * FindNMFeats
 *
 * Feature-visit callback.  userdata is an NMDefPtr.  Genes and coding
 * regions are counted and the latest one of each is remembered.  A
 * protein feature is remembered only when it is longer than the
 * longest one seen so far.
 *
 * NOTE(review): numprots is bumped only when a new longest protein is
 * recorded, so it is not a plain count of protein features.
 */
static void FindNMFeats (SeqFeatPtr sfp, Pointer userdata)
{
  Int4      span;
  NMDefPtr  tally;

  if (sfp == NULL) return;
  tally = (NMDefPtr) userdata;
  if (tally == NULL) return;

  if (sfp->data.choice == SEQFEAT_GENE) {
    tally->gene = sfp;
    tally->numgenes++;
  } else if (sfp->data.choice == SEQFEAT_CDREGION) {
    tally->cds = sfp;
    tally->numcds++;
  } else if (sfp->data.choice == SEQFEAT_PROT) {
    span = SeqLocLen (sfp->location);
    if (span > tally->protlen) {
      tally->prot = sfp;
      tally->protlen = span;
      tally->numprots++;
    }
  }
}
IsFlyCG(CharPtr str)3718 static Boolean IsFlyCG (CharPtr str)
3719 {
3720   Char  ch;
3721   if (StringHasNoText (str)) return FALSE;
3722   ch = *str;
3723   if (ch != 'C') return FALSE;
3724   str++;
3725   ch = *str;
3726   if (ch != 'G') return FALSE;
3727   str++;
3728   ch = *str;
3729   while (IS_DIGIT (ch)) {
3730     str++;
3731     ch = *str;
3732   }
3733   if (ch != '-') return FALSE;
3734   str++;
3735   ch = *str;
3736   if (ch != 'P') return FALSE;
3737   str++;
3738   ch = *str;
3739   if (IS_ALPHA (ch)) {
3740     str++;
3741     ch = *str;
3742     if (ch == '\0' || ch == ' ' || ch == ',' || ch == ';') return TRUE;
3743   }
3744   return FALSE;
3745 }
/*
 * ReplaceFlyDashPwithDashR
 *
 * Scans str one whitespace-delimited word at a time.  At the first word
 * accepted by IsFlyCG, the 'P' of its "-P" is overwritten with 'R' in
 * place and the function returns.  The string is left unchanged when no
 * such word exists.
 */
static void ReplaceFlyDashPwithDashR (CharPtr str)
{
  CharPtr  cursor;
  CharPtr  dash;

  cursor = str;
  while (StringDoesHaveText (cursor)) {
    /* move to the start of the next word */
    while (IS_WHITESP (*cursor)) {
      cursor++;
    }

    if (IsFlyCG (cursor)) {
      dash = StringStr (cursor, "-P");
      if (dash != NULL) {
        dash [1] = 'R';
        return;
      }
    }

    /* skip the rest of this word */
    while (*cursor != '\0' && (! IS_WHITESP (*cursor))) {
      cursor++;
    }
  }
}
FindNMDefLine(BioseqPtr bsp)3769 static CharPtr FindNMDefLine (BioseqPtr bsp)
3770 {
3771   BioSourcePtr  biop;
3772   Char          buf [512], buf2 [600];
3773   CharPtr       cds = NULL;
3774   Uint2         entityID;
3775   CharPtr       gene;
3776   Boolean       is_refseq = FALSE;
3777   size_t        len;
3778   NMDef         nd;
3779   OrgRefPtr     orp;
3780   CharPtr       ptr;
3781   SeqEntryPtr   sep;
3782   SeqIdPtr      sip;
3783   CharPtr       str;
3784   ValNodePtr    vnp;
3785   MemSet ((Pointer) &nd, 0, sizeof (NMDef));
3786   entityID = ObjMgrGetEntityIDForPointer (bsp);
3787   sep = GetBestTopParentForDataEx (entityID, bsp, TRUE);
3788   VisitFeaturesInSep (sep, (Pointer) &nd, FindNMFeats);
3789   if (nd.numgenes != 1 || nd.numcds != 1 || nd.numprots < 1) return NULL;
3790   vnp = GatherDescrOnBioseq (NULL, bsp, Seq_descr_source, FALSE);
3791   if (vnp == NULL) return NULL;
3792   biop = (BioSourcePtr) vnp->data.ptrvalue;
3793   orp = biop->org;
3794   if (orp == NULL || StringHasNoText (orp->taxname)) return NULL;
3795   FeatDefLabel (nd.gene, buf, sizeof (buf) - 1, OM_LABEL_CONTENT);
3796   gene = StringSaveNoNull (buf);
3797   FeatDefLabel (nd.cds, buf, sizeof (buf) - 1, OM_LABEL_CONTENT);
3798   for (sip = bsp->id; sip != NULL; sip = sip->next) {
3799     if (sip->choice == SEQID_OTHER) {
3800       is_refseq = TRUE;
3801     }
3802   }
3803   if (is_refseq) {
3804     /* special case Drosophila RefSeq NM titles */
3805     if (StringICmp (orp->taxname, "Drosophila melanogaster") == 0) {
3806       ReplaceFlyDashPwithDashR (buf);
3807     }
3808     ptr = StringStr (buf, "isoform ");
3809     if (ptr != NULL) {
3810       *ptr = '\0';
3811       ptr += 8;
3812       StringCpy (buf2, buf);
3813       StringCat (buf2, "transcript variant ");
3814       StringCat (buf2, ptr);
3815       cds = StringSaveNoNull (buf2);
3816     } else {
3817       cds = StringSaveNoNull (buf);
3818     }
3819   } else {
3820     cds = StringSaveNoNull (buf);
3821   }
3822   len = StringLen (orp->taxname) + StringLen (cds) +
3823         StringLen (gene) + StringLen ("  (), mRNA") + 10;
3824   str = (CharPtr) MemNew (len);
3825   if (str != NULL) {
3826     sprintf (str, "%s %s (%s), mRNA", orp->taxname, cds, gene);
3827   }
3828   MemFree (gene);
3829   MemFree (cds);
3830   return str;
3831 }
FindNRDefLine(BioseqPtr bsp)3832 static CharPtr FindNRDefLine (BioseqPtr bsp)
3833 {
3834   BioSourcePtr  biop;
3835   Char          buf [512];
3836   Uint2         entityID;
3837   CharPtr       gene;
3838   size_t        len;
3839   MolInfoPtr    mip;
3840   NMDef         nd;
3841   OrgRefPtr     orp;
3842   CharPtr       rna = "miscRNA";
3843   SeqEntryPtr   sep;
3844   CharPtr       str;
3845   ValNodePtr    vnp;
3846   MemSet ((Pointer) &nd, 0, sizeof (NMDef));
3847   entityID = ObjMgrGetEntityIDForPointer (bsp);
3848   sep = GetBestTopParentForDataEx (entityID, bsp, TRUE);
3849   VisitFeaturesInSep (sep, (Pointer) &nd, FindNMFeats);
3850   if (nd.numgenes < 1) return NULL;
3851   vnp = GatherDescrOnBioseq (NULL, bsp, Seq_descr_source, FALSE);
3852   if (vnp == NULL) return NULL;
3853   biop = (BioSourcePtr) vnp->data.ptrvalue;
3854   orp = biop->org;
3855   if (orp == NULL || StringHasNoText (orp->taxname)) return NULL;
3856   FeatDefLabel (nd.gene, buf, sizeof (buf) - 1, OM_LABEL_CONTENT);
3857   gene = StringSaveNoNull (buf);
3858   vnp = GatherDescrOnBioseq (NULL, bsp, Seq_descr_molinfo,TRUE);
3859   if (vnp != NULL) {
3860     mip = (MolInfoPtr) vnp->data.ptrvalue;
3861     if (mip != NULL) {
3862       switch (mip->biomol) {
3863         case MOLECULE_TYPE_PRE_MRNA :
3864           rna = "precursorRNA";
3865           break;
3866         case MOLECULE_TYPE_MRNA :
3867           rna = "mRNA";
3868           break;
3869         case MOLECULE_TYPE_RRNA :
3870           rna = "rRNA";
3871           break;
3872         case MOLECULE_TYPE_TRNA :
3873           rna = "tRNA";
3874           break;
3875         case MOLECULE_TYPE_SNRNA :
3876           rna = "snRNA";
3877           break;
3878         case MOLECULE_TYPE_SCRNA :
3879           rna = "scRNA";
3880           break;
3881         case MOLECULE_TYPE_CRNA :
3882           rna = "cRNA";
3883           break;
3884         case MOLECULE_TYPE_SNORNA :
3885           rna = "snoRNA";
3886           break;
3887         case MOLECULE_TYPE_TRANSCRIBED_RNA :
3888           rna = "miscRNA";
3889           break;
3890         case MOLECULE_TYPE_NCRNA :
3891           rna = "ncRNA";
3892           break;
3893         case MOLECULE_TYPE_TMRNA :
3894           rna = "tmRNA";
3895           break;
3896         default :
3897           break;
3898       }
3899     }
3900   }
3901   len = StringLen (orp->taxname) + StringLen (gene) +
3902         StringLen (", ") + 30;
3903   str = (CharPtr) MemNew (len);
3904   if (str != NULL) {
3905     sprintf (str, "%s %s, %s", orp->taxname, gene, rna);
3906   }
3907   MemFree (gene);
3908   return str;
3909 }
3910 
/*
 * TrimPunctuationFromEnd
 *
 * Removes, in place, the trailing run of spaces, semicolons, commas,
 * tildes and periods from str.  A string made up only of those
 * characters becomes empty.  NULL and empty strings are returned
 * untouched, and nothing is written when there is nothing to trim.
 *
 * Returns str.
 */
static CharPtr TrimPunctuationFromEnd (CharPtr str)

{
  CharPtr  end;
  Uchar    last;    /* unsigned so 8-bit characters compare cleanly */

  if (str == NULL || str [0] == '\0') return str;

  /* back up from the terminator over every trimmable character */
  end = str + StringLen (str);
  while (end > str) {
    last = (Uchar) end [-1];
    if (last != ' ' && last != ';' && last != ',' && last != '~' && last != '.') break;
    end--;
  }

  if (*end != '\0') {
    *end = '\0';
  }
  return str;
}
3939 
/*
 * TrimNonPeriodPunctuationFromEnd
 *
 * Removes, in place, the trailing run of spaces, semicolons, commas and
 * tildes from str; a period ends the run and is kept.  A string made up
 * only of those characters becomes empty.  NULL and empty strings are
 * returned untouched, and nothing is written when there is nothing to
 * trim.
 *
 * Returns str.
 */
static CharPtr TrimNonPeriodPunctuationFromEnd (CharPtr str)

{
  CharPtr  end;
  Uchar    last;    /* unsigned so 8-bit characters compare cleanly */

  if (str == NULL || str [0] == '\0') return str;

  /* back up from the terminator over every trimmable character */
  end = str + StringLen (str);
  while (end > str) {
    last = (Uchar) end [-1];
    if (last != ' ' && last != ';' && last != ',' && last != '~') break;
    end--;
  }

  if (*end != '\0') {
    *end = '\0';
  }
  return str;
}
3968 
FindProtDefLine(BioseqPtr bsp,Boolean extProtTitle)3969 static CharPtr FindProtDefLine(BioseqPtr bsp, Boolean extProtTitle)
3970 {
3971     SeqFeatPtr sfp = NULL /* , f */;
3972     ProtRefPtr prp;
3973     SeqFeatXrefPtr xref;
3974     GeneRefPtr grp=NULL;
3975     ValNodePtr vnp, /* v, */ syn;
3976     SeqLocPtr loc;
3977     CharPtr title = NULL, s, geneprod;
3978     /*
3979     Int4 diff_lowest = INT4_MAX, diff_current;
3980     */
3981     Int2 length = 0;
3982     SeqFeatPtr best_gene = NULL;
3983     Uint2 entityID;
3984     Boolean indexed;
3985     if (bsp == NULL) {
3986         return NULL;
3987     }
3988     entityID = ObjMgrGetEntityIDForPointer (bsp);
3989     indexed = (Boolean)SeqMgrFeaturesAreIndexed (entityID);
3990     sfp = NULL;
3991     if (indexed) {
3992         sfp = SeqMgrGetBestProteinFeature (bsp, NULL);
3993     } else {
3994         sfp = GetLongestProteinUnindexed (bsp);
3995         /*
3996         if (sfp == NULL) {
3997             sfp = GatherSeqFeatProt(bsp);
3998         }
3999         */
4000     }
4001     if (sfp != NULL) {
4002         prp = (ProtRefPtr) sfp->data.value.ptrvalue;
4003         if (prp && prp->name) {
4004             for (vnp=prp->name; vnp; vnp=vnp->next) {
4005                 length += StringLen((CharPtr)vnp->data.ptrvalue) + 2;
4006             }
4007             s = title = (CharPtr) MemNew(length + 1);
4008             if (prp->name->data.ptrvalue) {
4009                 sprintf(title, "%s",
4010                                         (CharPtr) prp->name->data.ptrvalue);
4011             }
4012             s += StringLen(title);
4013             if (extProtTitle) {
4014                 for (vnp=prp->name->next; vnp; vnp=vnp->next) {
4015                     sprintf(s, "; %s",
4016                                             (CharPtr) vnp->data.ptrvalue);
4017                     s += StringLen((CharPtr)vnp->data.ptrvalue) + 2;
4018                 }
4019             }
4020             TrimPunctuationFromEnd (title);
4021             /* if hypothetical protein, append locus_tag */
4022             if (StringICmp (title, "hypothetical protein") == 0) {
4023                 sfp = NULL;
4024                 if (indexed) {
4025                     sfp = SeqMgrGetCDSgivenProduct (bsp, NULL);
4026                 } else {
4027                     /*
4028                     sfp = GetCDSProtUnindexed (bsp);
4029                     */
4030                     sfp = GatherProtCDS(bsp);
4031                 }
4032                 if (sfp != NULL) {
4033                     grp = SeqMgrGetGeneXref (sfp);
4034                     if (grp == NULL) {
4035                         loc = sfp->location;
4036                         best_gene = NULL;
4037                         if (indexed) {
4038                             best_gene = SeqMgrGetOverlappingGene (loc, NULL);
4039                         } else {
4040                             best_gene = GetBestGeneUnindexed (loc, entityID);
4041                             /*
4042                             vnp = GatherGenesForCDS(loc);
4043                             for (v=vnp; v; v=v->next) {
4044                                 f = (SeqFeatPtr) v->data.ptrvalue;
4045                                 diff_current = SeqLocAinB(loc, f->location);
4046                                 if (! diff_current) {
4047                                     best_gene = f;
4048                                     break;
4049                                 } else if (diff_current > 0) {
4050                                     if ((diff_lowest == -1) || (diff_current<diff_lowest)) {
4051                                         diff_lowest = diff_current;
4052                                         best_gene = f;
4053                                     }
4054                                 }
4055                             }
4056                             ValNodeFree(vnp);
4057                             */
4058                         }
4059                         if (best_gene != NULL) {
4060                             grp = (GeneRefPtr) best_gene->data.value.ptrvalue;
4061                         }
4062                     }
4063                 }
4064                 if (grp != NULL) {
4065                     geneprod = NULL;
4066                     if (grp->locus_tag != NULL) {
4067                         geneprod = grp->locus_tag;
4068                     }
4069                     if (geneprod != NULL) {
4070                         s = (CharPtr) MemNew (StringLen (geneprod) + StringLen (title) + 20);
4071                         if (s != NULL) {
4072                             sprintf (s, "%s %s", title, geneprod);
4073                             MemFree (title);
4074                             title = s;
4075                         }
4076                     }
4077                 }
4078             }
4079         } else if (prp && prp->desc) {
4080             title = StringSave(prp->desc);
4081         } else if (prp && prp->activity) {
4082             if (prp->activity->data.ptrvalue) {
4083                 title = StringSave (prp->activity->data.ptrvalue);
4084             }
4085         }
4086     }
4087     if (title == NULL) {
4088         sfp = NULL;
4089         if (indexed) {
4090             sfp = SeqMgrGetCDSgivenProduct (bsp, NULL);
4091         } else {
4092             /*
4093             sfp = GetCDSProtUnindexed (bsp);
4094             */
4095             sfp = GatherProtCDS(bsp);
4096         }
4097         if (sfp != NULL) {
4098             loc = sfp->location;
4099             for (xref = sfp->xref; xref; xref=xref->next) {
4100                 if (xref->data.choice == SEQFEAT_GENE) {
4101                     grp = (GeneRefPtr) xref->data.value.ptrvalue;
4102                 }
4103             }
4104             if (grp) {
4105                 geneprod = NULL;
4106                 if (grp->locus != NULL) {
4107                     geneprod = grp->locus;
4108                 } else if (grp->syn != NULL) {
4109                     syn = grp->syn;
4110                     geneprod = (CharPtr) syn->data.ptrvalue;
4111                 } else if (grp->desc != NULL) {
4112                     geneprod = (CharPtr) grp->desc;
4113                 }
4114                 if (geneprod != NULL) {
4115                     s = (CharPtr) MemNew(StringLen(geneprod) + 15);
4116                     sprintf(s, "%s gene product", geneprod);
4117                     title = s;
4118                 }
4119             }
4120             if (title == NULL) {
4121                 best_gene = NULL;
4122                 if (indexed) {
4123                     best_gene = SeqMgrGetOverlappingGene (loc, NULL);
4124                 } else {
4125                     best_gene = GetBestGeneUnindexed (loc, entityID);
4126                     /*
4127                     vnp = GatherGenesForCDS(loc);
4128                     for (v=vnp; v; v=v->next) {
4129                         f = (SeqFeatPtr) v->data.ptrvalue;
4130                         diff_current = SeqLocAinB(loc, f->location);
4131                         if (! diff_current) {
4132                             best_gene = f;
4133                             break;
4134                         } else if (diff_current > 0) {
4135                             if ((diff_lowest == -1) || (diff_current<diff_lowest)) {
4136                                 diff_lowest = diff_current;
4137                                 best_gene = f;
4138                             }
4139                         }
4140                     }
4141                     ValNodeFree(vnp);
4142                     */
4143                 }
4144                 if (best_gene != NULL) {
4145                     grp = (GeneRefPtr) best_gene->data.value.ptrvalue;
4146                     if (grp) {
4147                         geneprod = NULL;
4148                         if (grp->locus != NULL) {
4149                             geneprod = grp->locus;
4150                         } else if (grp->syn != NULL) {
4151                             syn = grp->syn;
4152                             geneprod = (CharPtr) syn->data.ptrvalue;
4153                         } else if (grp->desc != NULL) {
4154                             geneprod = (CharPtr) grp->desc;
4155                         }
4156                         if (geneprod != NULL) {
4157                             s = (CharPtr) MemNew(StringLen(geneprod) + 15);
4158                             sprintf(s, "%s gene product", geneprod);
4159                             title = s;
4160                         }
4161                     }
4162                 }
4163             }
4164         }
4165     }
4166     if (title != NULL) {
4167       TrimPunctuationFromEnd (title);
4168     }
4169     if (title == NULL) {
4170       title = StringSave ("unnamed protein product");
4171     }
4172     return title;
4173 }
StrainNotAtEndOfTaxname(CharPtr name,CharPtr strain)4174 static Boolean StrainNotAtEndOfTaxname (CharPtr name, CharPtr strain)
4175 {
4176   size_t   len;
4177   CharPtr  ptr;
4178   char ch;
4179 
4180   if (StringHasNoText (name) || StringHasNoText (strain)) return TRUE;
4181   ptr = StringChr (name, ' ');
4182   if (ptr == NULL) return TRUE;
4183   ptr++;
4184   ptr = StringChr (ptr, ' ');
4185   if (ptr == NULL) return TRUE;
4186   ptr++;
4187   ptr = StringISearch (ptr, strain);
4188   if (ptr == NULL) return TRUE;
4189   len = StringLen (strain);
4190   ptr += len;
4191   if (! StringHasNoText (ptr)) {
4192     if (StringCmp (ptr, "'") == 0) {
4193       ptr -= len + 1;
4194       if (*ptr == '\'') return FALSE;
4195     }
4196     return TRUE;
4197   }
4198   ptr -= len + 1;
4199   ch = *ptr;
4200   if (ch != ' ' && ch != '-' && ch != ':' && ch != ';' && ch != '.') return TRUE;
4201   return FALSE;
4202 }
GetNumClones(CharPtr str)4203 static Int2 GetNumClones (CharPtr str)
4204 {
4205   Char  ch;
4206   Int2  count;
4207   if (StringHasNoText (str)) return 0;
4208   count = 1;
4209   ch = *str;
4210   while (ch != '\0') {
4211     if (ch == ';') {
4212       count++;
4213     }
4214     str++;
4215     ch = *str;
4216   }
4217   return count;
4218 }
SimpleSegSeqTitle(BioseqPtr bsp)4219 static CharPtr SimpleSegSeqTitle (BioseqPtr bsp)
4220 {
4221   BioSourcePtr       biop;
4222   SeqMgrFeatContext  ccontext;
4223   SeqFeatPtr         cds;
4224   CharPtr            clone = NULL;
4225   CharPtr            complete = "gene, complete cds";
4226   SeqMgrDescContext  dcontext;
4227   SeqMgrFeatContext  gcontext;
4228   SeqFeatPtr         gene;
4229   GeneRefPtr         grp;
4230   CharPtr            isolate = NULL;
4231   CharPtr            label = NULL;
4232   size_t             len;
4233   CharPtr            locus = NULL;
4234   OrgModPtr          mod;
4235   CharPtr            modifier = NULL;
4236   Int2               numclones;
4237   ObjMgrDataPtr      omdp;
4238   ObjMgrPtr          omp;
4239   OrgNamePtr         onp;
4240   OrgRefPtr          orp;
4241   CharPtr            organism = NULL;
4242   CharPtr            product = NULL;
4243   SeqDescrPtr        sdp;
4244   SubSourcePtr       ssp;
4245   CharPtr            str;
4246   CharPtr            strain = NULL;
4247   CharPtr            title;
4248   ValNodePtr         vnp;
4249   Uint2              entityID;
4250 
4251   if (bsp == NULL) return NULL;
4252   /* check to see if feature indexing has been called */
4253   omdp = (ObjMgrDataPtr) bsp->omdp;
4254   if (omdp == NULL) return NULL;
4255   omp = ObjMgrReadLock ();
4256   omdp = ObjMgrFindTop (omp, omdp);
4257   ObjMgrUnlock ();
4258   if (omdp == NULL) return NULL;
4259   /*
4260   if (omdp->indexed == 0) return NULL;
4261   */
4262 
4263   entityID = ObjMgrGetEntityIDForPointer (bsp);
4264   if (SeqMgrFeaturesAreIndexed (entityID) == 0) {
4265     SeqMgrIndexFeatures (entityID, NULL);
4266   }
4267 
4268   sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
4269   if (sdp == NULL) return NULL;
4270   biop = (BioSourcePtr) sdp->data.ptrvalue;
4271   if (biop == NULL) return NULL;
4272   orp = biop->org;
4273   if (orp != NULL && (! StringHasNoText (orp->taxname))) {
4274     organism = orp->taxname;
4275     onp = orp->orgname;
4276     if (onp != NULL) {
4277       mod = onp->mod;
4278       if (mod != NULL) {
4279         if (mod->subtype == ORGMOD_strain) {
4280           if (mod->subname != NULL && StrainNotAtEndOfTaxname (organism, mod->subname)) {
4281             strain = (CharPtr) mod->subname;
4282           }
4283         } else if (mod->subtype == ORGMOD_isolate) {
4284           isolate = (CharPtr) mod->subname;
4285         }
4286       }
4287     }
4288   } else {
4289     organism = "Unknown";
4290   }
4291   for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
4292     if (ssp->subtype == SUBSRC_clone) {
4293       if (ssp->name != NULL) {
4294         numclones = GetNumClones (ssp->name);
4295         if (numclones < 4) {
4296           clone = (CharPtr) ssp->name;
4297         }
4298       }
4299     }
4300   }
4301   cds = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &ccontext);
4302   if (cds != NULL) {
4303     if (cds->partial) {
4304       complete = "gene, partial cds";
4305     }
4306     product = ccontext.label;
4307     grp = SeqMgrGetGeneXref (cds);
4308     if (grp != NULL) {
4309       if (! StringHasNoText (grp->locus)) {
4310         locus = grp->locus;
4311       } else {
4312         vnp = grp->syn;
4313         if (vnp != NULL) {
4314           str = (CharPtr) vnp->data.ptrvalue;
4315           if (! StringHasNoText (str)) {
4316             locus = str;
4317           }
4318         }
4319       }
4320     }
4321     if (locus == NULL) {
4322       gene = SeqMgrGetOverlappingGene (cds->location, &gcontext);
4323       if (gene != NULL) {
4324         locus = gcontext.label;
4325       }
4326     }
4327   } else {
4328     if (StringDoesHaveText (strain)) {
4329       modifier = strain;
4330       label = "strain";
4331     } else if (StringDoesHaveText (clone)) {
4332       modifier = clone;
4333       label = "clone";
4334     } else if (StringDoesHaveText (isolate)) {
4335       modifier = isolate;
4336       label = "isolate";
4337     }
4338   }
4339   len = StringLen (organism) + StringLen (label) + StringLen (modifier) +
4340         StringLen (product) + StringLen (locus) + StringLen (complete);
4341   title = (CharPtr) MemNew (len + 10);
4342   if (organism != NULL) {
4343     StringCat (title, organism);
4344   }
4345   if (modifier != NULL) {
4346     StringCat (title, " ");
4347     StringCat (title, label);
4348     StringCat (title, " ");
4349     StringCat (title, modifier);
4350   }
4351   if (product != NULL) {
4352     StringCat (title, " ");
4353     StringCat (title, product);
4354   }
4355   if (locus != NULL) {
4356     StringCat (title, " (");
4357     StringCat (title, locus);
4358     StringCat (title, ")");
4359   }
4360   if (product != NULL || locus != NULL) {
4361     StringCat (title, " ");
4362     StringCat (title, complete);
4363   }
4364   TrimSpacesAroundString (title);
4365   return title;
4366 }
4367 
/*
   Returns a newly allocated string (MemNew) holding label immediately
   followed by value, or NULL if the allocation fails.  The concatenation
   relies on MemNew handing back zero-filled memory, as the rest of this
   file already does when it StringCats onto a fresh MemNew buffer.
*/
static CharPtr OrgModsLabeledPiece (CharPtr label, CharPtr value)
{
    CharPtr  piece;

    piece = (CharPtr) MemNew (StringLen (label) + StringLen (value) + 1);
    if (piece != NULL) {
        StringCat (piece, label);
        StringCat (piece, value);
    }
    return piece;
}

/*****************************************************************************
*
*   UseOrgMods(bsp, suffix, tech, htgs_pooled_multiclone)
*       Manufactures a title from the BioSource descriptor found on bsp:
*       taxname, then (when present) strain, chromosome, clone, map and
*       plasmid name, then the optional suffix.
*
*       - the plasmid name is only emitted when tech == MI_TECH_wgs
*       - clone text is ", pooled multiple clones" when
*         htgs_pooled_multiclone is set, ", N clones" when GetNumClones
*         reports more than 3, otherwise " clone <name>"
*       - only the first strain accepted by StrainNotAtEndOfTaxname is used;
*         it is cut at the first ';' and trimmed of trailing punctuation
*       - when a subsource type occurs more than once, the last one wins
*
*       Returns a string allocated with MemNew whose first character has
*       been upper-cased, or NULL when bsp is NULL, no source descriptor
*       (or an empty one) is found, or the result cannot be allocated.
*
*****************************************************************************/
static CharPtr UseOrgMods(BioseqPtr bsp, CharPtr suffix, Uint1 tech, Boolean htgs_pooled_multiclone)
{
    ValNodePtr    vnp;
    BioSourcePtr  biop;
    OrgModPtr     mod;
    OrgNamePtr    onp;
    OrgRefPtr     orp;
    SubSourcePtr  ssp;
    Char          ch;
    CharPtr       name = NULL, chr = NULL, str = NULL,
                  cln = NULL, map = NULL, pls = NULL, def = NULL, ptr;
    size_t        deflen;
    Int2          numclones;

    if (bsp == NULL) {
        return NULL;
    }
    vnp = GatherDescrOnBioseq (NULL, bsp, Seq_descr_source, FALSE);
    if (vnp == NULL) {
        return NULL;
    }
    biop = (BioSourcePtr) vnp->data.ptrvalue;
    if (biop == NULL) {
        /* descriptor without a payload: nothing to build a title from */
        return NULL;
    }

    orp = biop->org;
    if (orp != NULL && orp->taxname != NULL) {
        /* only read below, so no private copy is needed */
        name = orp->taxname;
    }

    for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
        if (ssp->name == NULL) continue;
        if (ssp->subtype == SUBSRC_chromosome) {
            MemFree (chr);  /* release an earlier duplicate, last one wins */
            chr = OrgModsLabeledPiece (" chromosome ", ssp->name);
        } else if (ssp->subtype == SUBSRC_clone) {
            numclones = GetNumClones (ssp->name);
            MemFree (cln);
            if (htgs_pooled_multiclone) {
                cln = StringSave (", pooled multiple clones");
            } else if (numclones > 3) {
                /* ", " + at most 6 characters of Int2 + " clones" + NUL fits in 20 */
                cln = (CharPtr) MemNew (20);
                if (cln != NULL) {
                    sprintf (cln, ", %d clones", (int) numclones);
                }
            } else {
                cln = OrgModsLabeledPiece (" clone ", ssp->name);
            }
        } else if (ssp->subtype == SUBSRC_map) {
            MemFree (map);
            map = OrgModsLabeledPiece (" map ", ssp->name);
        } else if (ssp->subtype == SUBSRC_plasmid_name) {
            MemFree (pls);
            pls = OrgModsLabeledPiece (" plasmid ", ssp->name);
        }
    }

    if (orp != NULL) {
        onp = orp->orgname;
        if (onp != NULL) {
            /* stop at the first strain that yields a usable piece */
            for (mod = onp->mod; mod != NULL && str == NULL; mod = mod->next) {
                if (mod->subtype != ORGMOD_strain) continue;
                if (mod->subname == NULL) continue;
                if (! StrainNotAtEndOfTaxname (name, mod->subname)) continue;
                str = OrgModsLabeledPiece (" strain ", mod->subname);
                if (str != NULL) {
                    ptr = StringChr (str, ';');
                    if (ptr != NULL) {
                        *ptr = '\0';
                    }
                    TrimNonPeriodPunctuationFromEnd (str);
                }
            }
        }
    }

    /* size the result from the pieces actually kept; +2 covers the blank
       placed before the suffix, +1 below the terminator */
    deflen = StringLen (name) + StringLen (str) + StringLen (chr) +
             StringLen (cln) + StringLen (map) + StringLen (pls) +
             StringLen (suffix) + 2;
    def = (CharPtr) MemNew (deflen + 1);
    if (def != NULL) {
        if (name != NULL) {
            StringCat (def, name);
        }
        if (str != NULL) {
            StringCat (def, str);
        }
        if (chr != NULL) {
            StringCat (def, chr);
        }
        if (cln != NULL) {
            StringCat (def, cln);
        }
        if (map != NULL) {
            StringCat (def, map);
        }
        if (pls != NULL && tech == MI_TECH_wgs) {
            StringCat (def, pls);
        }
        if (suffix != NULL) {
            StringCat (def, " ");
            StringCat (def, suffix);
        }
        TrimSpacesAroundString (def);
        ch = def [0];
        def [0] = TO_UPPER (ch);
    }

    /* the pieces are released on every path, including allocation failure */
    MemFree (str);
    MemFree (chr);
    MemFree (cln);
    MemFree (map);
    MemFree (pls);
    return def;
}
4489 
4490 /*
4491    The following lists need endogenous virus, hydrogenosome, chromosome, and chromatophore
4492 */
4493 
4494 static CharPtr organelleByItself [] = {
4495   NULL,
4496   NULL,
4497   "chloroplast",
4498   "chromoplast",
4499   "kinetoplast",
4500   "mitochondrion",
4501   "plastid",
4502   "macronuclear",
4503   "extrachromosomal",
4504   "plasmid",
4505   NULL,
4506   NULL,
4507   "cyanelle",
4508   "provirus",
4509   "virus",
4510   "nucleomorph",
4511   "apicoplast",
4512   "leucoplast",
4513   "protoplast",
4514   NULL,
4515   NULL,
4516   NULL,
4517   NULL
4518 };
4519 static CharPtr organelleWithPlasmid [] = {
4520   NULL,
4521   NULL,
4522   "chloroplast",
4523   "chromoplast",
4524   "kinetoplast",
4525   "mitochondrial",
4526   "plastid",
4527   "macronuclear",
4528   "extrachrom",
4529   "plasmid",
4530   NULL,
4531   NULL,
4532   "cyanelle",
4533   "proviral",
4534   "virus",
4535   "nucleomorph",
4536   "apicoplast",
4537   "leucoplast",
4538   "protoplast",
4539   NULL,
4540   NULL,
4541   NULL,
4542   NULL
4543 };
4544 static CharPtr organelleForWGS [] = {
4545   NULL,
4546   NULL,
4547   "chloroplast",
4548   "chromoplast",
4549   "kinetoplast",
4550   "mitochondrial",
4551   "plastid",
4552   "",
4553   "",
4554   "",
4555   "",
4556   "",
4557   "cyanelle",
4558   "proviral",
4559   "virus",
4560   "",
4561   "apicoplast",
4562   "leucoplast",
4563   "proplastid",
4564   "endogenous virus",
4565   "hydrogenosome",
4566   "chromosome",
4567   "chromatophore"
4568 };
4569 
4570 const Int4 kNumWGSOrganelles = sizeof (organelleForWGS) / sizeof (CharPtr);
4571 
4572 
/*
   Edits def in place: starting after its first character, every match that
   StringISearch (presumably a case-insensitive search) finds for "plasmid"
   has a leading 'P' replaced by 'p', and likewise every match for
   "element" has a leading 'E' replaced by 'e'.  Only that first letter of
   each match is touched.  Returns immediately when StringHasNoText reports
   that def has no text.
*/
static void LowercasePlasmidOrElement (CharPtr def)
{
  static CharPtr  words [2] = { "plasmid", "element" };
  static Char     upper [2] = { 'P', 'E' };
  static Char     lower [2] = { 'p', 'e' };
  CharPtr         hit;
  Int2            i;

  if (StringHasNoText (def)) return;

  for (i = 0; i < 2; i++) {
    /* both words are 7 characters long, so resume 7 past each match */
    for (hit = StringISearch (def + 1, words [i]);
         hit != NULL;
         hit = StringISearch (hit + 7, words [i])) {
      if (*hit == upper [i]) {
        *hit = lower [i];
      }
    }
  }
}
4593 
4594 
MakeCompleteChromTitle(BioseqPtr bsp,Uint1 biomol,Uint1 completeness)4595 NLM_EXTERN CharPtr MakeCompleteChromTitle (BioseqPtr bsp, Uint1 biomol, Uint1 completeness)
4596 {
4597     CharPtr       completeseq = ", complete sequence";
4598     CharPtr       completegen = ", complete genome";
4599     ItemInfoPtr   iip = NULL;
4600     ValNodePtr    vnp;
4601     BioSourcePtr  biop;
4602     OrgRefPtr     orp;
4603     SubSourcePtr  ssp;
4604     CharPtr       name = NULL, chr = NULL, orgnl = NULL,
4605                   seg = NULL, pls = NULL, def = NULL;
4606     Int2          deflen = 80; /* starts with space for all fixed text */
4607     Char          ch;
4608     Boolean       plasmid;
4609     Uint1         genome;
4610     if (bsp == NULL) {
4611         return NULL;
4612     }
4613     if ((vnp=GatherDescrOnBioseq(iip, bsp, Seq_descr_source,TRUE)) == NULL) {
4614         return NULL;
4615     }
4616     biop = (BioSourcePtr) vnp->data.ptrvalue;
4617     if (biop == NULL) {
4618         return NULL;
4619     }
4620         orp = biop->org;
4621         if (orp == NULL || orp->taxname == NULL) {
4622           return NULL;
4623         }
4624     name = orp->taxname;
4625     deflen += StringLen(orp->taxname);
4626     genome = biop->genome;
4627     plasmid = (Boolean) (biop->genome == GENOME_plasmid);
4628     for (ssp = biop->subtype; ssp; ssp=ssp->next) {
4629         if (ssp->subtype == SUBSRC_chromosome) {
4630             if (ssp->name != NULL) {
4631                 chr = ssp->name;
4632                 deflen += StringLen(ssp->name);
4633             }
4634         } else if (ssp->subtype == SUBSRC_segment) {
4635             if (ssp->name != NULL) {
4636                 seg = ssp->name;
4637                 deflen += StringLen(ssp->name);
4638             }
4639         } else if (ssp->subtype == SUBSRC_plasmid_name) {
4640             if (ssp->name != NULL) {
4641                 pls = ssp->name;
4642                 deflen += StringLen(ssp->name);
4643             }
4644         }
4645     }
4646     if (genome < kNumWGSOrganelles) {
4647         if (pls != NULL) {
4648             orgnl = organelleWithPlasmid [genome];
4649         } else {
4650             orgnl = organelleByItself [genome];
4651         }
4652         if (StringISearch (name, "virus") != NULL || StringISearch (name, "phage") != NULL) {
4653             if (genome == GENOME_proviral || genome == GENOME_virion) {
4654                 orgnl = NULL;
4655             }
4656         }
4657     }
4658     if (completeness == 2 ||
4659         completeness == 3 ||
4660         completeness == 4 ||
4661         completeness == 5) {
4662         /* remove "complete" component */
4663         completeseq = ", partial sequence";
4664         completegen = ", genome";
4665     }
4666     def = (CharPtr) MemNew(deflen+1);
4667     if (StringISearch (name, "plasmid") != NULL) {
4668         StringCat(def, name);
4669         StringCat (def, completeseq);
4670         ch = *def;
4671         *def = TO_UPPER (ch);
4672         LowercasePlasmidOrElement (def);
4673         return def;
4674     } else if (plasmid) {
4675         if (name && (! pls)) {
4676             StringCat (def, name);
4677             StringCat (def, " unnamed plasmid");
4678             StringCat (def, completeseq);
4679             ch = *def;
4680             *def = TO_UPPER (ch);
4681             return def;
4682         }
4683         if (pls) {
4684             if (name) {
4685                 StringCat (def, name);
4686                 StringCat (def, " ");
4687             }
4688             if (StringISearch (pls, "plasmid") == NULL && StringISearch (pls, "element") == NULL) {
4689                 StringCat(def, "plasmid ");
4690             }
4691             StringCat (def, pls);
4692             StringCat (def, completeseq);
4693             ch = *def;
4694             *def = TO_UPPER (ch);
4695             LowercasePlasmidOrElement (def);
4696             return def;
4697         }
4698     } else if (pls) {
4699         if (name) {
4700              StringCat (def, name);
4701             StringCat (def, " ");
4702         }
4703         if (orgnl != NULL) {
4704             StringCat (def, orgnl);
4705             StringCat (def, " ");
4706         }
4707         if (StringISearch (pls, "plasmid") == NULL && StringISearch (pls, "element") == NULL) {
4708             StringCat (def, "plasmid ");
4709         }
4710         StringCat (def, pls);
4711         StringCat (def, completeseq);
4712         ch = *def;
4713         *def = TO_UPPER (ch);
4714         LowercasePlasmidOrElement (def);
4715         return def;
4716     } else if (name) {
4717         StringCat (def, name);
4718     }
4719     if (orgnl != NULL) {
4720         if (chr != NULL) {
4721             StringCat (def, " ");
4722             StringCat (def, orgnl);
4723             StringCat (def, " chromosome ");
4724             StringCat(def, chr);
4725             StringCat (def, completeseq);
4726             ch = *def;
4727             *def = TO_UPPER (ch);
4728             return def;
4729         }
4730         StringCat (def, " ");
4731         StringCat (def, orgnl);
4732         StringCat (def, completegen);
4733         ch = *def;
4734         *def = TO_UPPER (ch);
4735         return def;
4736     }
4737     if (seg != NULL) {
4738         StringCat (def, " ");
4739         if (StringStr (seg, "DNA") == NULL &&
4740             StringStr (seg, "RNA") == NULL &&
4741             StringStr (seg, "segment") == NULL &&
4742             StringStr (seg, "Segment") == NULL) {
4743           StringCat (def, "segment ");
4744         }
4745         StringCat(def, seg);
4746         StringCat (def, completeseq);
4747         ch = *def;
4748         *def = TO_UPPER (ch);
4749         return def;
4750     }
4751     if (chr != NULL) {
4752         StringCat (def, " chromosome ");
4753         StringCat(def, chr);
4754         StringCat (def, completeseq);
4755         ch = *def;
4756         *def = TO_UPPER (ch);
4757         return def;
4758     }
4759     StringCat (def, completegen);
4760     ch = *def;
4761     *def = TO_UPPER (ch);
4762     return def;
4763 }
NotSpecialTaxName(CharPtr taxname)4764 static Boolean NotSpecialTaxName (CharPtr taxname)
4765 {
4766   if (StringHasNoText (taxname)) return TRUE;
4767   if (StringICmp (taxname, "synthetic construct") == 0) return FALSE;
4768   if (StringICmp (taxname, "artificial sequence") == 0) return FALSE;
4769   if (StringStr (taxname, "vector") != NULL) return FALSE;
4770   if (StringStr (taxname, "Vector") != NULL) return FALSE;
4771   return TRUE;
4772 }
/*
   Decides which prefix, if any, must be written in front of title.

   On return *ttl points at the text to print after the prefix (title, or
   title advanced past a leading "TPA: " when that is being replaced by
   "TPA_exp: " or "TPA_inf: ") and *pfx points at the prefix literal, or is
   NULL when no prefix is needed.

   is_tsa takes precedence over is_tpa; within TPA, tpa_exp takes precedence
   over tpa_inf.  Returns TRUE when a prefix was selected, FALSE when title
   is NULL or empty, already starts with the selected prefix, or neither
   is_tsa nor is_tpa is set.

   ttl and pfx must point at caller-owned variables; the prefix strings are
   literals, so they remain valid after this function returns.
*/
static Boolean DoTpaPrefix (
  CharPtr title,
  CharPtr PNTR ttl,
  CharPtr PNTR pfx,
  Boolean is_tpa,
  Boolean tpa_exp,
  Boolean tpa_inf,
  Boolean is_tsa
)
{
  CharPtr  wanted;
  Boolean  qualified = FALSE;  /* TRUE for the _exp / _inf flavours of TPA */

  *ttl = title;
  *pfx = NULL;

  if (title == NULL || *title == '\0') return FALSE;

  if (is_tsa) {
    wanted = "TSA: ";
  } else if (! is_tpa) {
    return FALSE;
  } else if (tpa_exp) {
    wanted = "TPA_exp: ";
    qualified = TRUE;
  } else if (tpa_inf) {
    wanted = "TPA_inf: ";
    qualified = TRUE;
  } else {
    wanted = "TPA: ";
  }

  /* nothing to add when the title already carries the prefix */
  if (StringNICmp (title, wanted, StringLen (wanted)) == 0) return FALSE;

  *pfx = wanted;
  if (qualified && StringNICmp (title, "TPA: ", 5) == 0) {
    /* a plain "TPA: " is superseded by the more specific prefix */
    *ttl = title + 5;
  }
  return TRUE;
}
4815 
4816 /*****************************************************************************
4817 *
4818 *   CreateDefLine(iip, bsp, buf, buflen, tech)
4819 *       Finds or makes a FASTA format defline using Gather functions
4820 *       buf should be very long if possible
4821 *       function truncates if buf not long enough
4822 *       a few deflines are longer than 255
4823 *
4824 *        ItemInfoPtr iip is used in flat file generator to keep entityId, itemId
4825 *        and itemtype
4826 *****************************************************************************/
CreateDefLineExEx(ItemInfoPtr iip,BioseqPtr bsp,CharPtr buf,Uint4 buflen,Uint1 tech,CharPtr accession,CharPtr organism,Boolean ignoreTitle,Boolean extProtTitle)4827 NLM_EXTERN Boolean CreateDefLineExEx (ItemInfoPtr iip, BioseqPtr bsp, CharPtr buf, Uint4 buflen, Uint1 tech,
4828                                       CharPtr accession, CharPtr organism, Boolean ignoreTitle, Boolean extProtTitle)
4829 {
4830     ValNodePtr vnp = NULL;
4831     CharPtr tmp = NULL, title = NULL, ttl = NULL, pfx = NULL;
4832     PdbBlockPtr pbp;
4833     PatentSeqIdPtr psip;
4834     PDBSeqIdPtr    pdbip;
4835     Uint4 diff, phase, i;
4836     Boolean doit;
4837     Int4 num_segs, num_gaps;
4838     static Char tbuf[128];
4839     static CharPtr htgs[2] = {
4840         "unordered", "ordered" };
4841     static CharPtr htg_phrase[3] = {
4842         "LOW-PASS SEQUENCE SAMPLING",
4843         "WORKING DRAFT SEQUENCE",
4844         "*** SEQUENCING IN PROGRESS ***" };
4845     Boolean htg_tech = FALSE, htgs_draft = FALSE, htgs_cancelled = FALSE,
4846             htgs_pooled_multiclone = FALSE, is_nc = FALSE, is_nm = FALSE,
4847             is_nr = FALSE, is_tpa = FALSE, tpa_exp = FALSE, tpa_inf = FALSE,
4848             is_tsa = FALSE;
4849     MolInfoPtr mip;
4850     GBBlockPtr gbp = NULL;
4851     EMBLBlockPtr ebp = NULL;
4852     ValNodePtr keywords = NULL;
4853     Boolean wgsmaster = FALSE;
4854     CharPtr suffix = NULL;
4855     SeqIdPtr sip;
4856     TextSeqIdPtr tsip;
4857     DbtagPtr general = NULL, dbt;
4858     ObjectIdPtr oip;
4859     ItemInfo ii;
4860     BioSourcePtr biop = NULL;
4861     OrgRefPtr orp;
4862     CharPtr taxname = NULL;
4863     SeqMgrDescContext dcontext;
4864     SeqMgrFeatContext fcontext;
4865     SeqFeatPtr sfp, src;
4866     Uint2 entityID;
4867     Uint1 genome;
4868     CharPtr orgnl = NULL;
4869 
4870     if ((bsp == NULL) || (buf == NULL) || buflen == 0) return FALSE;
4871     /* now using GetNextDescriptorUnindexed, so need to have called AssignIDsInEntityEx */
4872     if (bsp->idx.entityID == 0) {
4873       entityID = ObjMgrGetEntityIDForPointer (bsp);
4874       if (entityID != 0) {
4875         AssignIDsInEntityEx (entityID, 0, NULL, NULL);
4876       }
4877     }
4878     entityID = bsp->idx.entityID;
4879     for (sip = bsp->id; sip != NULL; sip = sip->next) {
4880         switch (sip->choice) {
4881             case SEQID_OTHER :
4882                 tsip = (TextSeqIdPtr) sip->data.ptrvalue;
4883                 if (tsip != NULL && tsip->accession != NULL) {
4884                     if (StringNICmp (tsip->accession, "NC_", 3) == 0) {
4885                         is_nc = TRUE;
4886                     } else if (StringNICmp (tsip->accession, "NM_", 3) == 0) {
4887                         is_nm = TRUE;
4888                     } else if (StringNICmp (tsip->accession, "NR_", 3) == 0) {
4889                         is_nr = TRUE;
4890                     }
4891                 }
4892                 break;
4893             case SEQID_TPG :
4894             case SEQID_TPE :
4895             case SEQID_TPD :
4896                 is_tpa = TRUE;
4897                 break;
4898             case SEQID_GENERAL :
4899                 dbt = (DbtagPtr) sip->data.ptrvalue;
4900                 if (dbt != NULL && (! IsSkippableDbtag (dbt))) {
4901                   general = dbt;
4902                 }
4903                 break;
4904             case SEQID_GENBANK :
4905             case SEQID_EMBL :
4906             case SEQID_DDBJ :
4907                 tsip = (TextSeqIdPtr) sip->data.ptrvalue;
4908                 if (tsip != NULL && tsip->accession != NULL) {
4909                     if (StringLen (tsip->accession) == 12) {
4910                         if (StringCmp (tsip->accession + 6, "000000") == 0) {
4911                             wgsmaster = TRUE;
4912                         }
4913                     } else if (StringLen (tsip->accession) == 13) {
4914                         if (StringCmp (tsip->accession + 6, "0000000") == 0) {
4915                             wgsmaster = TRUE;
4916                         }
4917                     } else if (StringLen (tsip->accession) == 14) {
4918                         if (StringCmp (tsip->accession + 6, "00000000") == 0) {
4919                             wgsmaster = TRUE;
4920                         }
4921                     }
4922                 }
4923                 break;
4924             case SEQID_GPIPE :
4925                 tsip = (TextSeqIdPtr) sip->data.ptrvalue;
4926                 break;
4927             default :
4928                 break;
4929         }
4930     }
4931     buflen--;
4932     buf[buflen] = '\0';
4933     tbuf[0] = '\0';
4934 
4935     if (tech == 0) {
4936       vnp = GetNextDescriptorUnindexed (bsp, Seq_descr_molinfo, NULL);
4937       if (vnp != NULL) {
4938         mip = (MolInfoPtr) vnp->data.ptrvalue;
4939         if (mip != NULL) {
4940           tech = mip->tech;
4941         }
4942       }
4943     }
4944 
4945     if (((tech >= MI_TECH_htgs_1) && (tech <= MI_TECH_htgs_3)) ||
4946         (tech == MI_TECH_htgs_0)) {
4947         htg_tech = TRUE;
4948     } else if (tech == MI_TECH_tsa) {
4949       is_tsa = TRUE;
4950     }
4951     if (iip == NULL && accession != NULL) {
4952         diff = LabelCopyExtra(buf, accession, buflen, "(", ") ");
4953         buflen -= diff;
4954         buf += diff;
4955     }
4956     diff = 0;
4957     if (htg_tech || is_tpa) {
4958         vnp=GatherDescrOnBioseq(iip, bsp, Seq_descr_genbank,TRUE);
4959         if (vnp != NULL) {
4960             gbp = (GBBlockPtr) vnp->data.ptrvalue;
4961             if (gbp != NULL) {
4962               keywords = gbp->keywords;
4963             }
4964         }
4965         vnp=GatherDescrOnBioseq(iip, bsp, Seq_descr_embl,TRUE);
4966         if (vnp != NULL) {
4967             ebp = (EMBLBlockPtr) vnp->data.ptrvalue;
4968             if (ebp != NULL) {
4969               keywords = ebp->keywords;
4970             }
4971         }
4972     }
4973     if (keywords != NULL) {
4974         for (vnp = keywords; vnp != NULL; vnp = vnp->next) {
4975             if (StringICmp ((CharPtr) vnp->data.ptrvalue, "HTGS_DRAFT") == 0) {
4976                 htgs_draft = TRUE;
4977             } else if (StringICmp ((CharPtr) vnp->data.ptrvalue, "HTGS_CANCELLED") == 0) {
4978                 htgs_cancelled = TRUE;
4979             } else if (StringICmp ((CharPtr) vnp->data.ptrvalue, "HTGS_POOLED_MULTICLONE") == 0 && htg_tech) {
4980                 htgs_pooled_multiclone = TRUE;
4981             } else if (StringICmp ((CharPtr) vnp->data.ptrvalue, "TPA:experimental") == 0) {
4982                 tpa_exp = TRUE;
4983             } else if (StringICmp ((CharPtr) vnp->data.ptrvalue, "TPA:inferential") == 0) {
4984                 tpa_inf = TRUE;
4985             }
4986         }
4987     }
4988     if (! ignoreTitle)
4989           {
4990             vnp=GatherDescrOnBioseq(iip, bsp, Seq_descr_title,TRUE);
4991             if (vnp != NULL)
4992               title = StringSaveNoNull((CharPtr)vnp->data.ptrvalue);
4993               if (title != NULL) {
4994                 TrimSpacesAroundString (title);
4995                 TrimPunctuationFromEnd (title);
4996               }
4997           }
4998     if (tech == MI_TECH_htgs_0 || tech == MI_TECH_htgs_1 || tech == MI_TECH_htgs_2) {
4999         MemFree(title);  /* manufacture all HTG titles */
5000         title = NULL;
5001         if (iip != NULL) {
5002           iip->entityID = 0;
5003           iip->itemID = 0;
5004           iip->itemtype = 0;
5005         }
5006         if (title == NULL || *title == '\0') {
5007             title = UseOrgMods(bsp, NULL, tech, htgs_pooled_multiclone);
5008             organism = NULL;
5009         }
5010     } else if (tech == MI_TECH_est || tech == MI_TECH_sts || tech == MI_TECH_survey) {
5011         if (title == NULL || *title == '\0') {
5012             title = UseOrgMods(bsp, NULL, tech, FALSE);
5013             organism = NULL;
5014         }
5015     } else if (tech == MI_TECH_wgs) {
5016         if (title == NULL || *title == '\0') {
5017             if (! wgsmaster) {
5018                 if (general != NULL) {
5019                     oip = general->tag;
5020                     if (oip != NULL) {
5021                         if (! StringHasNoText (oip->str)) {
5022                             suffix = oip->str;
5023                         }
5024                     }
5025                 }
5026             }
5027             title = UseOrgMods(bsp, suffix, tech, FALSE);
5028             organism = NULL;
5029         }
5030     } else if (tech == MI_TECH_tsa) {
5031         if (title == NULL || *title == '\0') {
5032             if (general != NULL) {
5033                 oip = general->tag;
5034                 if (oip != NULL) {
5035                     if (! StringHasNoText (oip->str)) {
5036                         suffix = oip->str;
5037                     }
5038                 }
5039             }
5040             title = UseOrgMods(bsp, suffix, tech, FALSE);
5041             organism = NULL;
5042         }
5043     } else if (is_nc && title == NULL) {
5044         /* manufacture complete chromosome titles if not already present */
5045         vnp = GatherDescrOnBioseq (&ii, bsp, Seq_descr_molinfo,TRUE);
5046         if (vnp != NULL) {
5047             mip = (MolInfoPtr) vnp->data.ptrvalue;
5048             if (mip != NULL &&
5049                 (mip->biomol == MOLECULE_TYPE_GENOMIC || mip->biomol == MOLECULE_TYPE_OTHER_GENETIC_MATERIAL) /* && mip->completeness == 1 */) {
5050                 title = MakeCompleteChromTitle (bsp, mip->biomol, mip->completeness);
5051                 organism = NULL;
5052                 if (iip != NULL) {
5053                     iip->entityID = ii.entityID;
5054                     iip->itemID = ii.itemID;
5055                     iip->itemtype = ii.itemtype;
5056                 }
5057             }
5058         }
5059     } else if (is_nm && title == NULL) {
5060       title = FindNMDefLine (bsp);
5061       if (title != NULL && iip != NULL) {
5062         iip->entityID = 0;
5063         iip->itemID = 0;
5064         iip->itemtype = 0;
5065       }
5066     } else if (is_nr && title == NULL) {
5067       title = FindNRDefLine (bsp);
5068       if (title != NULL && iip != NULL) {
5069         iip->entityID = 0;
5070         iip->itemID = 0;
5071         iip->itemtype = 0;
5072       }
5073     }
5074 /* some titles may have zero length */
5075     if (title != NULL && *title != '\0') {
5076         ttl = title;
5077         pfx = NULL;
5078         if (DoTpaPrefix (title, &ttl, &pfx, is_tpa, tpa_exp, tpa_inf, is_tsa)) {
5079             diff = LabelCopy (buf, pfx, buflen);
5080             buflen -= diff;
5081             buf += diff;
5082         }
5083         diff = LabelCopy (buf, ttl, buflen);
5084                                 /* remove trailing blanks BUT NOT periods */
5085         tmp = buf + diff - 1;   /* point at last character */
5086         while (tmp >= buf && ((*tmp <= ' ') /* || (*tmp == '.') */)) {
5087             *tmp = '\0';
5088             tmp--;
5089             diff--;
5090         }
5091     } else if ((vnp = GatherDescrOnBioseq(iip, bsp, Seq_descr_pdb,TRUE)) != NULL) {
5092         pbp = (PdbBlockPtr)(vnp->data.ptrvalue);
5093         for (vnp = bsp->id; vnp != NULL; vnp = vnp->next) {
5094             if (vnp->choice == SEQID_PDB) {
5095                 pdbip = (PDBSeqIdPtr)(vnp->data.ptrvalue);
5096                 if (pdbip && pdbip->chain > 32) {
5097                     sprintf(tbuf, "Chain %c, ", pdbip->chain);
5098                     diff = LabelCopy(buf, tbuf, buflen);
5099                     buflen -= diff;
5100                     buf += diff;
5101                     break;
5102                 }
5103             }
5104         }
5105         if (pbp && pbp->compound) {
5106             tmp = StringSave ((CharPtr)(pbp->compound->data.ptrvalue));
5107             TrimNonPeriodPunctuationFromEnd (tmp);
5108             diff = LabelCopy(buf, tmp, buflen);
5109             MemFree (tmp);
5110         }
5111     } else {
5112         for (vnp = bsp->id; vnp != NULL; vnp = vnp->next) {
5113             if (vnp->choice == SEQID_PATENT)
5114             {
5115                 psip = (PatentSeqIdPtr)(vnp->data.ptrvalue);
5116                 if (psip) {
5117                     sprintf(tbuf, "Sequence %d from Patent %s %s",
5118                     (int)psip->seqid, psip->cit->country, psip->cit->number);
5119                     diff = LabelCopy(buf, tbuf, buflen);
5120                     break;
5121                 }
5122             }
5123         }
5124         if (vnp == NULL) {
5125             if (ISA_aa(bsp->mol)) {
5126                 title = FindProtDefLine(bsp, extProtTitle);
5127                 vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
5128                 if (vnp != NULL && organism == NULL) {
5129                     biop = (BioSourcePtr) vnp->data.ptrvalue;
5130                     if (biop != NULL) {
5131                         orp = biop->org;
5132                         if (orp != NULL) {
5133                             taxname = orp->taxname;
5134                         }
5135                     }
5136                     if (taxname == NULL || NotSpecialTaxName (taxname)) {
5137                         if (SeqMgrFeaturesAreIndexed (entityID) == 0) {
5138                             SeqMgrIndexFeatures (entityID, NULL);
5139                         }
5140                         sfp = SeqMgrGetCDSgivenProduct (bsp, NULL);
5141                         if (sfp != NULL) {
5142                             src = SeqMgrGetOverlappingSource (sfp->location, &fcontext);
5143                             if (src != NULL) {
5144                                 biop = (BioSourcePtr) src->data.value.ptrvalue;
5145                                 if (biop != NULL) {
5146                                     orp = biop->org;
5147                                     if (orp != NULL) {
5148                                         taxname = orp->taxname;
5149                                     }
5150                                 }
5151                             }
5152                         }
5153                     }
5154                 }
5155             }
5156             if (title != NULL) {
5157                 /*
5158                 if (! StringHasNoText (taxname)) {
5159                     diff = LabelCopy(buf, taxname, buflen);
5160                     buflen -= diff;
5161                     buf += diff;
5162                     diff = LabelCopy(buf, " ", buflen);
5163                     buflen -= diff;
5164                     buf += diff;
5165                     diff = LabelCopy(buf, title, buflen);
5166                 } else {
5167                     diff = LabelCopy(buf, title, buflen);
5168                 }
5169                 */
5170               ttl = title;
5171               pfx = NULL;
5172               if (DoTpaPrefix (title, &ttl, &pfx, is_tpa, tpa_exp, tpa_inf, is_tsa)) {
5173                     diff = LabelCopy (buf, pfx, buflen);
5174                     buflen -= diff;
5175                     buf += diff;
5176                 }
5177                 diff = LabelCopy (buf, ttl, buflen);
5178                 if (organism == NULL && taxname != NULL) {
5179                     organism = taxname;
5180                     iip = NULL;
5181                 }
5182             } else if (!htg_tech) {
5183                 if (bsp->repr == Seq_repr_seg) {
5184                     title = SimpleSegSeqTitle (bsp);
5185                 }
5186                 if (title == NULL) {
5187                     title = UseOrgMods(bsp, NULL, tech, FALSE);
5188                 }
5189                 ttl = title;
5190                 pfx = NULL;
5191                 if (DoTpaPrefix (title, &ttl, &pfx, is_tpa, tpa_exp, tpa_inf, is_tsa)) {
5192                     diff = LabelCopy (buf, pfx, buflen);
5193                     buflen -= diff;
5194                     buf += diff;
5195                 }
5196                 if (ttl != NULL) {
5197                     diff = LabelCopy (buf, ttl, buflen);
5198                 } else {
5199                     diff = LabelCopy (buf, "No definition line found", buflen);
5200                 }
5201             }
5202         }
5203     }
5204     if (title != NULL) {
5205       TrimNonPeriodPunctuationFromEnd (title);
5206     }
5207     buflen -= diff;
5208     buf += diff;
5209     if (htg_tech) {
5210         if (tech == MI_TECH_htgs_0)
5211             phase = 0;
5212         else
5213             phase = (Uint4)(tech - MI_TECH_htgs_1 + 1);
5214         if (title == NULL|| *title == '\0') {
5215             title = UseOrgMods(bsp, NULL, tech, htgs_pooled_multiclone);
5216             organism = NULL;
5217             if (title != NULL) {
5218                 ttl = title;
5219                 pfx = NULL;
5220                 if (DoTpaPrefix (title, &ttl, &pfx, is_tpa, tpa_exp, tpa_inf, is_tsa)) {
5221                     diff = LabelCopy (buf, pfx, buflen);
5222                     buflen -= diff;
5223                     buf += diff;
5224                 }
5225                 diff = LabelCopy (buf, ttl, buflen);
5226                 buflen -= diff;
5227                 buf += diff;
5228             }
5229         }
5230         if (phase == 3)
5231         {
5232             if (title) {
5233                 if (title && StringStr(title, "complete sequence") == NULL) {
5234                     diff = LabelCopy(buf, ", complete sequence", buflen);
5235                     buflen -= diff;
5236                     buf += diff;
5237                 }
5238             }
5239         } else {
5240             doit = FALSE;
5241             if (phase == 0) {
5242                 if (StringStr(title, "LOW-PASS") == NULL) {
5243                     doit = TRUE;
5244                     i = 0;
5245                 }
5246             } else {
5247                 if (htgs_draft) {
5248                     if (StringStr(title, "WORKING DRAFT") == NULL) {
5249                         doit = TRUE;
5250                         i = 1;
5251                     }
5252                 } else if (! htgs_cancelled) {
5253                     if (StringStr(title, "SEQUENCING IN") == NULL) {
5254                         doit = TRUE;
5255                         i = 2;
5256                     }
5257                 }
5258             }
5259             if (doit)
5260             {
5261                 if (diff != 0) {
5262                     diff = LabelCopy(buf, ", ", buflen);
5263                     buflen -= diff;
5264                     buf += diff;
5265                 }
5266                 diff = LabelCopy(buf, htg_phrase[i], buflen);
5267                 buflen -= diff;
5268                 buf += diff;
5269             }
5270             if ((phase != 0) && (bsp->repr == Seq_repr_delta)) {
5271                 if (CountGapsInDeltaSeq(bsp,
5272                         &num_segs, &num_gaps, NULL, NULL, NULL, 0))
5273                 {
5274                     if (num_gaps > 0) {
5275                         sprintf(tbuf, ", %ld %s pieces", (long)(num_gaps + 1), htgs[phase - 1]);
5276                     } else {
5277                         /*
5278                         sprintf(tbuf, ", %ld %s piece", (long)(num_gaps + 1), htgs[phase - 1]);
5279                         */
5280                     }
5281                     diff = LabelCopy(buf, tbuf, buflen);
5282                     buflen -= diff;
5283                     buf += diff;
5284                 }
5285             }
5286             else if (phase != 0) {
5287                 /*
5288                 sprintf(tbuf, ", in %s pieces", htgs[phase-1]);
5289                 diff = LabelCopy(buf, tbuf, buflen);
5290                 buflen -= diff;
5291                 buf += diff;
5292                 */
5293             }
5294         }
5295     } else if (tech == MI_TECH_est || tech == MI_TECH_sts || tech == MI_TECH_survey || tech == MI_TECH_wgs) {
5296         if (title == NULL|| *title == '\0') {
5297             title = UseOrgMods(bsp, NULL, tech, FALSE);
5298             organism = NULL;
5299             if (title != NULL) {
5300                 ttl = title;
5301                 pfx = NULL;
5302                 if (DoTpaPrefix (title, &ttl, &pfx, is_tpa, tpa_exp, tpa_inf, is_tsa)) {
5303                     diff = LabelCopy (buf, pfx, buflen);
5304                     buflen -= diff;
5305                     buf += diff;
5306                 }
5307                 diff = LabelCopy (buf, ttl, buflen);
5308                 buflen -= diff;
5309                 buf += diff;
5310             }
5311         }
5312         if (tech == MI_TECH_est) {
5313             if (title) {
5314                 if (title && StringStr(title, "mRNA sequence") == NULL) {
5315                     diff = LabelCopy(buf, ", mRNA sequence", buflen);
5316                     buflen -= diff;
5317                     buf += diff;
5318                 }
5319             }
5320         } else if (tech == MI_TECH_sts) {
5321             if (title) {
5322                 if (title && StringStr(title, "sequence tagged site") == NULL) {
5323                     diff = LabelCopy(buf, ", sequence tagged site", buflen);
5324                     buflen -= diff;
5325                     buf += diff;
5326                 }
5327             }
5328         } else if (tech == MI_TECH_survey) {
5329             if (title) {
5330                 if (title && StringStr(title, "genomic survey sequence") == NULL) {
5331                     diff = LabelCopy(buf, ", genomic survey sequence", buflen);
5332                     buflen -= diff;
5333                     buf += diff;
5334                 }
5335             }
5336         } else if (tech == MI_TECH_wgs) {
5337             if (title) {
5338                 vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
5339                 if (vnp != NULL) {
5340                     biop = (BioSourcePtr) vnp->data.ptrvalue;
5341                     if (biop != NULL) {
5342                         genome = biop->genome;
5343                         if (genome < kNumWGSOrganelles) {
5344                             orgnl = organelleForWGS [genome];
5345                         }
5346                     }
5347                 }
5348                 if (wgsmaster) {
5349                     if (title && StringStr (title, "whole genome shotgun sequencing project") == NULL) {
5350                         diff = LabelCopy(buf, " whole genome shotgun sequencing project", buflen);
5351                         buflen -= diff;
5352                         buf += diff;
5353                     }
5354                 } else if (title && StringStr (title, "whole genome shotgun sequence") == NULL) {
5355                     if (orgnl != NULL && StringStr (title, orgnl) == NULL) {
5356                         diff = LabelCopy(buf, " ", buflen);
5357                         buflen -= diff;
5358                         buf += diff;
5359                         diff = LabelCopy(buf, orgnl, buflen);
5360                         buflen -= diff;
5361                         buf += diff;
5362                     }
5363                     diff = LabelCopy(buf, ", whole genome shotgun sequence", buflen);
5364                     buflen -= diff;
5365                     buf += diff;
5366                 }
5367             }
5368         }
5369     }
5370     if (iip == NULL && organism != NULL) {
5371         doit = TRUE;
5372         if (title) {
5373             if (StringStr(title, organism) != NULL)
5374                 doit = FALSE;
5375         }
5376         if (doit)
5377             LabelCopyExtra(buf, organism, buflen, " [", "]");
5378     }
5379         MemFree(title);
5380     return TRUE;
5381 }
CreateDefLineEx(ItemInfoPtr iip,BioseqPtr bsp,CharPtr buf,Uint4 buflen,Uint1 tech,CharPtr accession,CharPtr organism,Boolean ignoreTitle)5382 NLM_EXTERN Boolean CreateDefLineEx (ItemInfoPtr iip, BioseqPtr bsp, CharPtr buf, Uint4 buflen, Uint1 tech,
5383                                     CharPtr accession, CharPtr organism, Boolean ignoreTitle)
5384 {
5385   return CreateDefLineExEx (iip, bsp, buf, buflen, tech, accession, organism, ignoreTitle, FALSE);
5386 }
CreateDefLine(ItemInfoPtr iip,BioseqPtr bsp,CharPtr buf,Uint4 buflen,Uint1 tech,CharPtr accession,CharPtr organism)5387 NLM_EXTERN Boolean CreateDefLine (ItemInfoPtr iip, BioseqPtr bsp, CharPtr buf, Uint4 buflen,
5388                                   Uint1 tech, CharPtr accession, CharPtr organism)
5389 {
5390   return CreateDefLineExEx (iip, bsp, buf, buflen, tech, accession, organism, FALSE, FALSE);
5391 }
5392 /*****************************************************************************
5393 *
5394 *   FastaSeqPort(bsp, is_na, do_virtual)
5395 *       opens a SeqPort for a fasta output of bsp
5396 *
5397 *****************************************************************************/
FastaSeqPort(BioseqPtr bsp,Boolean is_na,Boolean do_virtual,Uint1 code)5398 NLM_EXTERN SeqPortPtr FastaSeqPort(BioseqPtr bsp, Boolean is_na, Boolean do_virtual,
5399                                    Uint1 code)
5400 {
5401     SeqPortPtr spp = NULL;
5402     if (bsp == NULL) return spp;
5403     spp = SeqPortNew(bsp, 0, -1, 0, code);
5404     if (do_virtual)
5405         SeqPortSet_do_virtual(spp, TRUE);
5406     SeqPortSeek(spp, 0, SEEK_SET);
5407     return spp;
5408 }
5409 /*****************************************************************************
5410 *
5411 *   FastaSeqPortEx(bsp, is_na, do_virtual, slp)
5412 *       opens a SeqPort for a fasta output of bsp constrained to slp
5413 *
5414 *****************************************************************************/
FastaSeqPortEx(BioseqPtr bsp,Boolean is_na,Boolean do_virtual,Uint1 code,SeqLocPtr slp)5415 NLM_EXTERN SeqPortPtr FastaSeqPortEx(BioseqPtr bsp, Boolean is_na, Boolean do_virtual,
5416                                      Uint1 code, SeqLocPtr slp)
5417 {
5418     SeqPortPtr spp = NULL;
5419     if (bsp == NULL) return spp;
5420     if (slp == NULL) return FastaSeqPort (bsp, is_na, do_virtual, code);
5421     spp = SeqPortNew(bsp, SeqLocStart(slp), SeqLocStop(slp),
5422             SeqLocStrand(slp), code);
5423     if (do_virtual)
5424         SeqPortSet_do_virtual(spp, TRUE);
5425     SeqPortSeek(spp, 0, SEEK_SET);
5426     return spp;
5427 }
5428 /*****************************************************************************
5429 *
5430 *   FastaSeqLine(spp, buf, linelen)
5431 *     an open seqport is passed in.
5432 *     fills buf with linelen bases
5433 *     assumes buf[linelen] = '\0'
5434 *     returns FALSE when no more residues to print
5435 *
5436 *****************************************************************************/
FastaSeqLine(SeqPortPtr spp,CharPtr buf,Int2 linelen,Boolean is_na)5437 NLM_EXTERN Boolean FastaSeqLine(SeqPortPtr spp, CharPtr buf, Int2 linelen, Boolean is_na)
5438 {
5439     return FastaSeqLineEx(spp, buf, linelen, is_na, FALSE);
5440 }
FastaSeqLineEx(SeqPortPtr spp,CharPtr buf,Int2 linelen,Boolean is_na,Boolean do_virtual)5441 NLM_EXTERN Boolean FastaSeqLineEx(SeqPortPtr spp, CharPtr buf, Int2 linelen, Boolean is_na, Boolean
5442 do_virtual)
5443 {
5444     Int2 ctr = 0;
5445     Uint1 residue;
5446     Int4 pos;
5447     Char idbuf[128];
5448     if ((spp == NULL) || (buf == NULL)) return FALSE;
5449     while ((residue = SeqPortGetResidue(spp)) != SEQPORT_EOF)
5450     {
5451         if (! IS_residue(residue))
5452         {
5453             if (residue == INVALID_RESIDUE)
5454             {
5455                 if (is_na)
5456                     residue = 'N';
5457                 else
5458                     residue = 'X';
5459                 FastaId(spp->bsp, idbuf, 39);
5460                 pos = SeqPortTell(spp);
5461                 ErrPostEx(SEV_ERROR,0,0, "ToFastA: Invalid residue at position %ld in %s",
5462                     (long) pos, idbuf);
5463             }
5464             else
5465             {
5466                 if (residue == SEQPORT_VIRT)  /* gap */
5467                 {
5468                     if (ctr)  /* got some residues already */
5469                     {
5470                         buf[ctr] = '\0';
5471                         SeqPortSeek(spp, -1, SEEK_CUR);  /* back up one */
5472                                      /* can only seek to a real residue, so go past it */
5473                                                 residue = SeqPortGetResidue(spp);
5474                                                 if (residue == SEQPORT_VIRT)
5475                                                    SeqPortSeek(spp, -1, SEEK_CUR);
5476                         return TRUE;
5477                     }
5478                     else if (! do_virtual)       /* first one */
5479                     {
5480                         buf[ctr] = '-';
5481                         buf[ctr + 1] = '\0';
5482                         return TRUE;
5483                     }
5484                 }
5485                 residue = '\0';
5486             }
5487         }
5488         if (residue != '\0')
5489         {
5490             buf[ctr] = residue;
5491             ctr++;
5492             if (ctr == linelen)
5493             {
5494                 buf[ctr] = '\0';
5495                 return TRUE;
5496             }
5497         }
5498     }
5499     buf[ctr] = '\0';
5500     if (ctr)
5501         return TRUE;
5502     else
5503         return FALSE;
5504 }
/*****************************************************************************
*
*   NC_Cleanup (entityID, ptr)
*     internal function for genome RefSeq processing; the static callbacks
*     that follow are its helpers
*
*****************************************************************************/
RemoveAllTitles(GatherObjectPtr gop)5511 static Boolean RemoveAllTitles (GatherObjectPtr gop)
5512 {
5513   ObjValNodePtr  ovp;
5514   SeqDescrPtr    sdp;
5515   if (gop == NULL ||
5516       gop->itemtype != OBJ_SEQDESC ||
5517       gop->subtype != Seq_descr_title) return TRUE;
5518   sdp = (SeqDescrPtr) gop->dataptr;
5519   if (sdp == NULL || sdp->extended == 0) return TRUE;
5520   ovp = (ObjValNodePtr) sdp;
5521   ovp->idx.deleteme = TRUE;
5522   return TRUE;
5523 }
AddNcTitles(GatherObjectPtr gop)5524 static Boolean AddNcTitles (GatherObjectPtr gop)
5525 {
5526   BioseqPtr     bsp;
5527   Char          buf [512];
5528   Boolean       is_nc;
5529   /*
5530   MolInfoPtr    mip;
5531   SeqDescrPtr   sdp;
5532   */
5533   SeqIdPtr      sip;
5534   CharPtr       str;
5535   TextSeqIdPtr  tsip;
5536   if (gop == NULL ||
5537       gop->itemtype != OBJ_BIOSEQ) return TRUE;
5538   bsp = (BioseqPtr) gop->dataptr;
5539   if (bsp == NULL) return TRUE;
5540   is_nc = FALSE;
5541   for (sip = bsp->id; sip != NULL; sip = sip->next) {
5542     if (sip->choice == SEQID_OTHER) {
5543       tsip = (TextSeqIdPtr) sip->data.ptrvalue;
5544       if (tsip != NULL && tsip->accession != NULL) {
5545         if (StringNICmp (tsip->accession, "NC_", 3) == 0) {
5546           is_nc = TRUE;
5547         }
5548       }
5549     }
5550   }
5551   if (! is_nc) return TRUE;
5552   if (NewCreateDefLineBuf (NULL, bsp, buf, sizeof (buf), FALSE, FALSE)) {
5553     if (! StringHasNoText (buf)) {
5554       str = StringSaveNoNull (buf);
5555       if (str != NULL) {
5556         SeqDescrAddPointer (&(bsp->descr), Seq_descr_title, (Pointer) str);
5557       }
5558     }
5559   }
5560   /*
5561   for (sdp = bsp->descr; sdp != NULL; sdp = sdp->next) {
5562     if (sdp->choice == Seq_descr_molinfo) {
5563       mip = (MolInfoPtr) sdp->data.ptrvalue;
5564       if (mip != NULL &&
5565           mip->biomol == MOLECULE_TYPE_GENOMIC &&
5566           mip->completeness == 1) {
5567         mip->completeness = 0;
5568       }
5569     }
5570   }
5571   */
5572   return TRUE;
5573 }
/*
 * ClearKeywordsProc
 *   Descriptor visitor.  For a Seq_descr_genbank descriptor, frees the
 *   keyword list of its GB-block.  If that leaves the block with no
 *   remaining content (no extra accessions, source, keywords, origin, date,
 *   entry date, division or taxonomy) and the descriptor is an extended
 *   node, the descriptor is flagged for removal by a later
 *   DeleteMarkedObjects call.
 *
 *   Previously the emptiness test had an empty body, so the descriptor was
 *   flagged unconditionally and any remaining GB-block fields were
 *   discarded along with the keywords.
 *
 *   userdata is not used.
 */
static void ClearKeywordsProc (SeqDescrPtr sdp, Pointer userdata)
{
  GBBlockPtr     gbp;
  ObjValNodePtr  ovn;

  if (sdp == NULL || sdp->choice != Seq_descr_genbank) return;
  gbp = (GBBlockPtr) sdp->data.ptrvalue;
  if (gbp == NULL) return;

  gbp->keywords = ValNodeFreeData (gbp->keywords);

  /* keep the descriptor while the block still carries any other data */
  if (gbp->extra_accessions != NULL || gbp->source != NULL ||
      gbp->keywords != NULL || gbp->origin != NULL ||
      gbp->date != NULL || gbp->entry_date != NULL ||
      gbp->div != NULL || gbp->taxonomy != NULL) return;

  /* only extended nodes carry the idx block that holds deleteme */
  if (sdp->extended == 0) return;
  ovn = (ObjValNodePtr) sdp;
  ovn->idx.deleteme = TRUE;
}
ClearGenBankKeywords(Uint2 entityID,Pointer ptr)5591 NLM_EXTERN void ClearGenBankKeywords (Uint2 entityID, Pointer ptr)
5592 {
5593   SeqEntryPtr  sep;
5594   if (entityID == 0) {
5595     entityID = ObjMgrGetEntityIDForPointer (ptr);
5596   }
5597   if (entityID == 0) return;
5598   sep = GetTopSeqEntryForEntityID (entityID);
5599   VisitDescriptorsInSep (sep, NULL, ClearKeywordsProc);
5600   DeleteMarkedObjects (entityID, 0, NULL);
5601 }
5602 
/*
 * IsNcCallback
 *   Bioseq visitor.  userdata must point to a Boolean; it is set to TRUE
 *   when bsp has a SEQID_OTHER id whose accession starts with "NC_"
 *   (case-insensitive).  The flag is never cleared here, so it accumulates
 *   across all visited Bioseqs.
 */
static void IsNcCallback (BioseqPtr bsp, Pointer userdata)

{
  BoolPtr       found;
  SeqIdPtr      sip;
  TextSeqIdPtr  tsip;

  found = (BoolPtr) userdata;
  if (bsp == NULL || found == NULL) return;

  for (sip = bsp->id; sip != NULL; sip = sip->next) {
    if (sip->choice == SEQID_OTHER) {
      tsip = (TextSeqIdPtr) sip->data.ptrvalue;
      if (tsip != NULL && StringNICmp (tsip->accession, "NC_", 3) == 0) {
        *found = TRUE;
      }
    }
  }
}
5623 
NC_Cleanup(Uint2 entityID,Pointer ptr)5624 NLM_EXTERN void NC_Cleanup (Uint2 entityID, Pointer ptr)
5625 {
5626   Boolean      objMgrFilt [OBJ_MAX];
5627   Boolean      is_nc = FALSE;
5628   SeqEntryPtr  sep;
5629 
5630   if (entityID == 0) {
5631     entityID = ObjMgrGetEntityIDForPointer (ptr);
5632   }
5633   if (entityID == 0) return;
5634 
5635   sep = GetTopSeqEntryForEntityID (entityID);
5636   VisitBioseqsInSep (sep, (Pointer) &is_nc, IsNcCallback);
5637   if (! is_nc) return;
5638 
5639   AssignIDsInEntity (entityID, 0, NULL);
5640   MemSet ((Pointer) objMgrFilt, FALSE, sizeof (objMgrFilt));
5641   objMgrFilt [OBJ_SEQDESC] = TRUE;
5642   GatherObjectsInEntity (entityID, 0, NULL, RemoveAllTitles, NULL, objMgrFilt);
5643   VisitDescriptorsInSep (sep, NULL, ClearKeywordsProc);
5644   DeleteMarkedObjects (entityID, 0, NULL);
5645   MemSet ((Pointer) objMgrFilt, FALSE, sizeof (objMgrFilt));
5646   objMgrFilt [OBJ_BIOSEQ] = TRUE;
5647   GatherObjectsInEntity (entityID, 0, NULL, AddNcTitles, NULL, objMgrFilt);
5648 }
5649 
InstantiateNCTitle(Uint2 entityID,Pointer ptr)5650 NLM_EXTERN void InstantiateNCTitle (Uint2 entityID, Pointer ptr)
5651 {
5652   Boolean      objMgrFilt [OBJ_MAX];
5653 
5654   if (entityID == 0) {
5655     entityID = ObjMgrGetEntityIDForPointer (ptr);
5656   }
5657   if (entityID == 0) return;
5658 
5659   AssignIDsInEntity (entityID, 0, NULL);
5660   MemSet ((Pointer) objMgrFilt, FALSE, sizeof (objMgrFilt));
5661   objMgrFilt [OBJ_BIOSEQ] = TRUE;
5662   GatherObjectsInEntity (entityID, 0, NULL, AddNcTitles, NULL, objMgrFilt);
5663 }
5664 
AddNmTitles(GatherObjectPtr gop)5665 static Boolean AddNmTitles (GatherObjectPtr gop)
5666 {
5667   BioseqPtr     bsp;
5668   Char          buf [512];
5669   Boolean       is_nm;
5670   SeqIdPtr      sip;
5671   CharPtr       str;
5672   TextSeqIdPtr  tsip;
5673   if (gop == NULL ||
5674       gop->itemtype != OBJ_BIOSEQ) return TRUE;
5675   bsp = (BioseqPtr) gop->dataptr;
5676   if (bsp == NULL) return TRUE;
5677   is_nm = FALSE;
5678   for (sip = bsp->id; sip != NULL; sip = sip->next) {
5679     if (sip->choice == SEQID_OTHER) {
5680       tsip = (TextSeqIdPtr) sip->data.ptrvalue;
5681       if (tsip != NULL && tsip->accession != NULL) {
5682         if (StringNICmp (tsip->accession, "NM_", 3) == 0) {
5683           is_nm = TRUE;
5684         } else if (StringNICmp (tsip->accession, "XM_", 3) == 0) {
5685           is_nm = TRUE;
5686         }
5687       }
5688     }
5689   }
5690   if (! is_nm) return TRUE;
5691   if (NewCreateDefLineBuf (NULL, bsp, buf, sizeof (buf), FALSE, FALSE)) {
5692     if (! StringHasNoText (buf)) {
5693       str = StringSaveNoNull (buf);
5694       if (str != NULL) {
5695         SeqDescrAddPointer (&(bsp->descr), Seq_descr_title, (Pointer) str);
5696       }
5697     }
5698   }
5699   return TRUE;
5700 }
5701 
InstantiateNMTitles(Uint2 entityID,Pointer ptr)5702 NLM_EXTERN void InstantiateNMTitles (Uint2 entityID, Pointer ptr)
5703 {
5704   Boolean      objMgrFilt [OBJ_MAX];
5705 
5706   if (entityID == 0) {
5707     entityID = ObjMgrGetEntityIDForPointer (ptr);
5708   }
5709   if (entityID == 0) return;
5710 
5711   AssignIDsInEntity (entityID, 0, NULL);
5712   MemSet ((Pointer) objMgrFilt, FALSE, sizeof (objMgrFilt));
5713   objMgrFilt [OBJ_BIOSEQ] = TRUE;
5714   GatherObjectsInEntity (entityID, 0, NULL, AddNmTitles, NULL, objMgrFilt);
5715 }
5716 
/*
 * ClearProtTitlesProc
 *   Bioseq visitor.  For a protein Bioseq that has no SEQID_OTHER id,
 *   flags every title descriptor held in an extended node for deletion.
 *   Nucleotide Bioseqs and proteins with a SEQID_OTHER id are left alone.
 *   userdata is not used.
 */
static void ClearProtTitlesProc (BioseqPtr bsp, Pointer userdata)
{
  SeqDescrPtr  sdp;
  SeqIdPtr     sip;

  if (bsp == NULL || ! ISA_aa (bsp->mol)) return;

  sip = bsp->id;
  while (sip != NULL) {
    if (sip->choice == SEQID_OTHER) return;
    sip = sip->next;
  }

  for (sdp = bsp->descr; sdp != NULL; sdp = sdp->next) {
    if (sdp->choice != Seq_descr_title || sdp->extended == 0) continue;
    ((ObjValNodePtr) sdp)->idx.deleteme = TRUE;
  }
}
/*
 * ClearProtTitlesNPS
 *   Bioseq-set visitor.  For a set whose class is nuc-prot, visits every
 *   Bioseq in the set with ClearProtTitlesProc.  Sets of any other class
 *   are ignored.
 *
 *   A NULL bssp is ignored, matching the NULL checks made by the other
 *   visitor callbacks in this file.
 *   userdata is not used.
 */
static void ClearProtTitlesNPS (BioseqSetPtr bssp, Pointer userdata)
{
  if (bssp == NULL) return;
  if (bssp->_class != BioseqseqSet_class_nuc_prot) return;
  VisitBioseqsInSet (bssp, NULL, ClearProtTitlesProc);
}
ClearProteinTitlesInNucProts(Uint2 entityID,Pointer ptr)5741 NLM_EXTERN void ClearProteinTitlesInNucProts (Uint2 entityID, Pointer ptr)
5742 {
5743   SeqEntryPtr  sep;
5744   if (entityID == 0) {
5745     entityID = ObjMgrGetEntityIDForPointer (ptr);
5746   }
5747   if (entityID == 0) return;
5748   sep = GetTopSeqEntryForEntityID (entityID);
5749   VisitSetsInSep (sep, NULL, ClearProtTitlesNPS);
5750   DeleteMarkedObjects (entityID, 0, NULL);
5751 }
5752 
5753 
/*
 * AddProtTitles
 *   Bioseq visitor.  Adds a generated title to a protein Bioseq that has
 *   none in its own descriptor chain.  Skipped when the Bioseq is not a
 *   protein, when it carries a PIR, SWISS-PROT, patent, PRF or PDB id, or
 *   when a Seq_descr_title is already present.
 *   userdata is not used.
 */
static void AddProtTitles (BioseqPtr bsp, Pointer userdata)
{
  Char         defline [512];
  SeqDescrPtr  sdp;
  SeqIdPtr     sip;
  CharPtr      title;

  if (bsp == NULL || ! ISA_aa (bsp->mol)) return;

  for (sip = bsp->id; sip != NULL; sip = sip->next) {
    switch (sip->choice) {
      case SEQID_PIR :
      case SEQID_SWISSPROT :
      case SEQID_PATENT :
      case SEQID_PRF :
      case SEQID_PDB :
        return;
      default :
        break;
    }
  }

  sdp = bsp->descr;
  while (sdp != NULL) {
    if (sdp->choice == Seq_descr_title) return;
    sdp = sdp->next;
  }

  if (! NewCreateDefLineBuf (NULL, bsp, defline, sizeof (defline), FALSE, FALSE)) return;
  if (StringHasNoText (defline)) return;

  title = StringSaveNoNull (defline);
  if (title != NULL) {
    SeqDescrAddPointer (&(bsp->descr), Seq_descr_title, (Pointer) title);
  }
}
InstantiateProteinTitles(Uint2 entityID,Pointer ptr)5781 NLM_EXTERN void InstantiateProteinTitles (Uint2 entityID, Pointer ptr)
5782 {
5783   SeqEntryPtr  sep;
5784   if (entityID == 0) {
5785     entityID = ObjMgrGetEntityIDForPointer (ptr);
5786   }
5787   if (entityID == 0) return;
5788   AssignIDsInEntity (entityID, 0, NULL);
5789   sep = GetTopSeqEntryForEntityID (entityID);
5790   VisitBioseqsInSep (sep, NULL, AddProtTitles);
5791 }
5792 
5793 
UpdateProteinTitle(BioseqPtr bsp)5794 NLM_EXTERN void UpdateProteinTitle (BioseqPtr bsp)
5795 {
5796   Char         buf [512];
5797   SeqDescrPtr  sdp;
5798   ObjValNodePtr ovp;
5799   SeqIdPtr     sip;
5800   CharPtr      str;
5801 
5802   if (bsp == NULL || !ISA_aa (bsp->mol)) {
5803     return;
5804   }
5805 
5806   /* we don't create protein titles for these IDs */
5807   for (sip = bsp->id; sip != NULL; sip = sip->next) {
5808     if (sip->choice == SEQID_PIR ||
5809         sip->choice == SEQID_SWISSPROT ||
5810         sip->choice == SEQID_PATENT ||
5811         sip->choice == SEQID_PRF ||
5812         sip->choice == SEQID_PDB) return;
5813   }
5814 
5815   sdp = BioseqGetSeqDescr (bsp, Seq_descr_title, NULL);
5816   if (sdp == NULL) {
5817     /* we only update a title if it already exists */
5818     return;
5819   }
5820   if (sdp->extended) {
5821     ovp = (ObjValNodePtr) sdp;
5822     ovp->idx.deleteme = TRUE;
5823     DeleteMarkedObjects (bsp->idx.entityID, OBJ_BIOSEQ, bsp);
5824   }
5825 
5826   if (NewCreateDefLineBuf (NULL, bsp, buf, sizeof (buf), FALSE, FALSE)) {
5827     if (! StringHasNoText (buf)) {
5828       str = StringSaveNoNull (buf);
5829       if (str != NULL) {
5830         SeqDescrAddPointer (&(bsp->descr), Seq_descr_title, (Pointer) str);
5831       }
5832     }
5833   }
5834 }
5835 
5836 
5837 /* NEW DEFLINE GENERATOR */
5838 
5839 typedef struct deflinestruct {
5840   /* instance variables */
5841   ItemInfoPtr  m_iip;
5842   BioseqPtr    m_bioseq;
5843 
5844   /* ignore existing title is forced for certain types */
5845   Boolean  m_reconstruct;
5846   Boolean  m_allprotnames;
5847 
5848   Boolean  m_gpipemode;
5849   Boolean  m_devmode;
5850 
5851   /* seq-inst fields */
5852   Boolean m_is_na;
5853   Boolean m_is_aa;
5854 
5855   Boolean m_is_seg;
5856   Boolean m_is_delta;
5857   Boolean m_is_virtual;
5858   Boolean m_is_map;
5859   Uint1   m_topology;
5860 
5861   /* seq-id fields */
5862   Boolean m_is_nc;
5863   Boolean m_is_nm;
5864   Boolean m_is_nr;
5865   Boolean m_is_patent;
5866   Boolean m_is_pdb;
5867   Boolean m_is_wp;
5868   Boolean m_third_party;
5869   Boolean m_wgs_master;
5870   Boolean m_tsa_master;
5871   Boolean m_tls_master;
5872 
5873   CharPtr m_general_str;
5874   CharPtr m_patent_country;
5875   CharPtr m_patent_number;
5876   int     m_patent_sequence;
5877 
5878   int     m_pdb_chain;
5879 
5880   /* molinfo fields */
5881   Uint1   m_mi_biomol;
5882   Uint1   m_mi_tech;
5883   Uint1   m_mi_completeness;
5884 
5885   Boolean m_htg_tech;
5886   Boolean m_htgs_unfinished;
5887   Boolean m_is_tls;
5888   Boolean m_is_tsa;
5889   Boolean m_is_wgs;
5890   Boolean m_is_est_sts_gss;
5891 
5892   Boolean m_use_biosrc;
5893 
5894   /* genbank or embl block keyword fields */
5895   Boolean m_htgs_cancelled;
5896   Boolean m_htgs_draft;
5897   Boolean m_htgs_pooled;
5898   Boolean m_tpa_exp;
5899   Boolean m_tpa_inf;
5900   Boolean m_tpa_reasm;
5901   Boolean m_unordered;
5902 
5903   /* pdb block fields */
5904   CharPtr m_pdb_compound;
5905 
5906   /* biosource fields */
5907   CharPtr m_taxname;
5908   Boolean m_multispecies;
5909   int     m_genome;
5910   Boolean m_is_plasmid;
5911   Boolean m_is_chromosome;
5912 
5913   CharPtr m_organelle;
5914 
5915   CharPtr m_first_super_kingdom;
5916   CharPtr m_second_super_kingdom;
5917   Boolean m_is_cross_kingdom;
5918 
5919   /* subsource fields */
5920   CharPtr m_chromosome;
5921   CharPtr m_clone;
5922   Boolean m_has_clone;
5923   CharPtr m_map;
5924   CharPtr m_plasmid;
5925   CharPtr m_segment;
5926 
5927   /* orgmod fields */
5928   CharPtr m_breed;
5929   CharPtr m_cultivar;
5930   CharPtr m_isolate;
5931   CharPtr m_strain;
5932 
5933   /* map fields */
5934   CharPtr m_enzyme;
5935 
5936   /* user object fields */
5937   Boolean m_is_unverified;
5938   CharPtr m_targeted_locus;
5939 
5940   /* comment fields */
5941   Boolean m_is_pseudogene;
5942 
5943   /* exception fields */
5944   TextFsaPtr m_low_quality_fsa;
5945 } DefLineData, PNTR DefLinePtr;
5946 
x_CDShasLowQualityException(DefLinePtr dlp,SeqFeatPtr sfp)5947 static Boolean x_CDShasLowQualityException (
5948   DefLinePtr dlp,
5949   SeqFeatPtr sfp
5950 )
5951 
5952 {
5953   Char        ch;
5954   TextFsaPtr  fsa;
5955   ValNodePtr  matches;
5956   CharPtr     ptr;
5957   Int4        state;
5958 
5959   if (dlp == NULL || sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION) return FALSE;
5960 
5961   if (! sfp->excpt) return FALSE;
5962   if (StringHasNoText (sfp->except_text)) return FALSE;
5963 
5964   fsa = dlp->m_low_quality_fsa;
5965   if (fsa == NULL) return FALSE;
5966 
5967   state = 0;
5968   matches = NULL;
5969   for (ptr = sfp->except_text, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
5970     state = TextFsaNext (fsa, state, ch, &matches);
5971     if (matches != NULL) {
5972       return TRUE;
5973     }
5974   }
5975 
5976   return FALSE;
5977 }
5978 
x_OrganelleName(DefLinePtr dlp,Boolean has_plasmid,Boolean virus_or_phage,Boolean wgs_suffix)5979 static CharPtr x_OrganelleName (
5980   DefLinePtr dlp,
5981   Boolean has_plasmid,
5982   Boolean virus_or_phage,
5983   Boolean wgs_suffix
5984 )
5985 
5986 {
5987   CharPtr  result = NULL;
5988 
5989   if (dlp == NULL) return NULL;
5990 
5991   switch (dlp->m_genome) {
5992       case GENOME_chloroplast :
5993         result = "chloroplast";
5994         break;
5995       case GENOME_chromoplast :
5996         result = "chromoplast";
5997         break;
5998       case GENOME_kinetoplast :
5999         result = "kinetoplast";
6000         break;
6001       case GENOME_mitochondrion :
6002         {
6003           if (has_plasmid || wgs_suffix) {
6004             result = "mitochondrial";
6005           } else {
6006             result = "mitochondrion";
6007           }
6008           break;
6009         }
6010       case GENOME_plastid :
6011         result = "plastid";
6012         break;
6013       case GENOME_macronuclear :
6014         {
6015           result = "macronuclear";
6016           break;
6017         }
6018       case GENOME_extrachrom :
6019         {
6020           if (! wgs_suffix) {
6021             result = "extrachromosomal";
6022           }
6023           break;
6024         }
6025       case GENOME_plasmid :
6026         {
6027           if (! wgs_suffix) {
6028             result = "plasmid";
6029           }
6030           break;
6031         }
6032       /* transposon and insertion-seq are obsolete */
6033       case GENOME_cyanelle :
6034         result = "cyanelle";
6035         break;
6036       case GENOME_proviral :
6037         {
6038           if (! virus_or_phage) {
6039             if (has_plasmid || wgs_suffix) {
6040               result = "proviral";
6041             } else {
6042               result = "provirus";
6043             }
6044           }
6045           break;
6046         }
6047       case GENOME_virion :
6048         {
6049           if (! virus_or_phage) {
6050             result = "virus";
6051           }
6052           break;
6053         }
6054       case GENOME_nucleomorph :
6055         {
6056           if (! wgs_suffix) {
6057             result = "nucleomorph";
6058           }
6059           break;
6060         }
6061       case GENOME_apicoplast :
6062         result = "apicoplast";
6063         break;
6064       case GENOME_leucoplast :
6065         result = "leucoplast";
6066         break;
6067       case GENOME_proplastid :
6068         result = "proplastid";
6069         break;
6070       case GENOME_endogenous_virus :
6071         result = "endogenous virus";
6072         break;
6073       case GENOME_hydrogenosome :
6074         result = "hydrogenosome";
6075         break;
6076       case GENOME_chromosome :
6077         result = "chromosome";
6078         break;
6079       case GENOME_chromatophore :
6080         result = "chromatophore";
6081         break;
6082   }
6083 
6084   return result;
6085 }
6086 
6087 /* set instance variables from Seq-inst, Seq-ids, MolInfo, etc., but not BioSource */
x_SetFlags(DefLinePtr dlp)6088 static void x_SetFlags (
6089   DefLinePtr dlp
6090 )
6091 
6092 {
6093   BioSourcePtr    biop;
6094   BioseqPtr       bsp;
6095   IdPatPtr        cit;
6096   ValNodePtr      compound;
6097   DbtagPtr        dbt;
6098   EMBLBlockPtr    ebp;
6099   GBBlockPtr      gbp;
6100   DbtagPtr        general;
6101   ValNodePtr      keywords;
6102   size_t          len;
6103   MolInfoPtr      mip;
6104   Int2            num_super_kingdom = 0;
6105   ObjectIdPtr     oip;
6106   OrgNamePtr      onp;
6107   OrgRefPtr       orp;
6108   PdbBlockPtr     pbp;
6109   PDBSeqIdPtr     pdbip;
6110   PatentSeqIdPtr  psip;
6111   RsiteRefPtr     rrp;
6112   SeqDescrPtr     sdp;
6113   SeqFeatPtr      sfp;
6114   SeqIdPtr        sip;
6115   CharPtr         str;
6116   Boolean         super_kingdoms_different = FALSE;
6117   TaxElementPtr   tep;
6118   TextSeqIdPtr    tsip;
6119   UserFieldPtr    ufp;
6120   UserObjectPtr   uop;
6121   ValNodePtr      vnp;
6122 
6123   if (dlp == NULL) return;
6124 
6125   bsp = dlp->m_bioseq;
6126   if (bsp == NULL) return;
6127 
6128   dlp->m_is_na = (Boolean) ISA_na (bsp->mol);
6129   dlp->m_is_aa = (Boolean) ISA_aa (bsp->mol);
6130   dlp->m_topology = bsp->topology;
6131 
6132   dlp->m_is_seg = (Boolean) (bsp->repr == Seq_repr_seg);
6133   dlp->m_is_delta = (Boolean) (bsp->repr == Seq_repr_delta);
6134   dlp->m_is_virtual = (Boolean) (bsp->repr == Seq_repr_virtual);
6135   dlp->m_is_map =  (Boolean) (bsp->repr == Seq_repr_map);
6136 
6137   /* process Seq-ids */
6138   for (sip = bsp->id; sip != NULL; sip = sip->next) {
6139     switch (sip->choice) {
6140         case SEQID_OTHER :
6141           tsip = (TextSeqIdPtr) sip->data.ptrvalue;
6142           if (tsip != NULL && tsip->accession != NULL) {
6143             if (StringNICmp (tsip->accession, "NC_", 3) == 0) {
6144               dlp->m_is_nc = TRUE;
6145             } else if (StringNICmp (tsip->accession, "NM_", 3) == 0) {
6146               dlp->m_is_nm = TRUE;
6147             } else if (StringNICmp (tsip->accession, "XM_", 3) == 0) {
6148               dlp->m_is_nm = TRUE;
6149             } else if (StringNICmp (tsip->accession, "NR_", 3) == 0) {
6150               dlp->m_is_nr = TRUE;
6151             } else if (StringNICmp (tsip->accession, "WP_", 3) == 0) {
6152               dlp->m_is_wp = TRUE;
6153             }
6154             len = StringLen (tsip->accession);
6155             if (len == 15) {
6156               if (StringCmp (tsip->accession + 9, "000000") == 0) {
6157                 dlp->m_wgs_master = TRUE;
6158               }
6159             } else if (len == 16) {
6160               if (StringCmp (tsip->accession + 9, "0000000") == 0) {
6161                 dlp->m_wgs_master = TRUE;
6162               }
6163             } else if (len == 17) {
6164               if (StringCmp (tsip->accession + 10, "0000000") == 0) {
6165                 dlp->m_wgs_master = TRUE;
6166               }
6167             }
6168           }
6169           break;
6170         case SEQID_GENBANK :
6171         case SEQID_EMBL :
6172         case SEQID_DDBJ :
6173           tsip = (TextSeqIdPtr) sip->data.ptrvalue;
6174           if (tsip != NULL && tsip->accession != NULL) {
6175             len = StringLen (tsip->accession);
6176             if (len == 12) {
6177               if (StringCmp (tsip->accession + 6, "000000") == 0) {
6178                 dlp->m_wgs_master = TRUE;
6179               }
6180             } else if (len == 13) {
6181               if (StringCmp (tsip->accession + 6, "0000000") == 0) {
6182                 dlp->m_wgs_master = TRUE;
6183               }
6184             } else if (len == 14) {
6185               if (StringCmp (tsip->accession + 6, "00000000") == 0) {
6186                 dlp->m_wgs_master = TRUE;
6187               }
6188             }
6189           }
6190           break;
6191         case SEQID_GENERAL :
6192           dbt = (DbtagPtr) sip->data.ptrvalue;
6193           if (dbt != NULL && (! IsSkippableDbtag (dbt))) {
6194             general = dbt;
6195             if (general != NULL) {
6196               oip = general->tag;
6197               if (oip != NULL) {
6198                 if (! StringHasNoText (oip->str)) {
6199                   dlp->m_general_str = oip->str;
6200                 }
6201               }
6202             }
6203           }
6204           break;
6205         case SEQID_TPG :
6206         case SEQID_TPE :
6207         case SEQID_TPD :
6208           dlp->m_third_party = TRUE;
6209           break;
6210         case SEQID_PDB :
6211           dlp->m_is_pdb = TRUE;
6212           pdbip = (PDBSeqIdPtr) sip->data.ptrvalue;
6213           if (pdbip && pdbip->chain > 32) {
6214             dlp->m_pdb_chain = pdbip->chain;
6215           }
6216           break;
6217         case SEQID_PATENT :
6218           dlp->m_is_patent = TRUE;
6219           psip = (PatentSeqIdPtr) sip->data.ptrvalue;
6220           if (psip != NULL) {
6221             dlp->m_patent_sequence = (int) psip->seqid;
6222             cit = psip->cit;
6223             if (cit != NULL) {
6224               dlp->m_patent_country = cit->country;
6225               if (StringDoesHaveText (cit->number)) {
6226                 dlp->m_patent_number = cit->number;
6227               } else if (StringDoesHaveText (cit->app_number)) {
6228                 dlp->m_patent_number = cit->app_number;
6229               }
6230             }
6231           }
6232           break;
6233         case SEQID_GPIPE :
6234           break;
6235         default :
6236           break;
6237     }
6238   }
6239 
6240   /* process MolInfo tech */
6241   sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_molinfo, NULL);
6242   if (sdp != NULL && sdp->choice == Seq_descr_molinfo) {
6243     mip = (MolInfoPtr) sdp->data.ptrvalue;
6244     if (mip != NULL) {
6245       dlp->m_mi_biomol = mip->biomol;
6246       dlp->m_mi_tech = mip->tech;
6247       dlp->m_mi_completeness = mip->completeness;
6248       switch (dlp->m_mi_tech) {
6249           case MI_TECH_htgs_0 :
6250           case MI_TECH_htgs_1 :
6251           case MI_TECH_htgs_2 :
6252             dlp->m_htgs_unfinished = TRUE;
6253             /* manufacture all titles for unfinished HTG sequences */
6254             dlp->m_reconstruct = TRUE;
6255             /* fall through */
6256           case MI_TECH_htgs_3 :
6257             dlp->m_htg_tech = TRUE;
6258             dlp->m_use_biosrc = TRUE;
6259             break;
6260           case MI_TECH_est :
6261           case MI_TECH_sts :
6262           case MI_TECH_survey :
6263             dlp->m_is_est_sts_gss = TRUE;
6264             dlp->m_use_biosrc = TRUE;
6265             break;
6266           case MI_TECH_wgs :
6267             dlp->m_is_wgs = TRUE;
6268             dlp->m_use_biosrc = TRUE;
6269             break;
6270           case MI_TECH_tsa :
6271             dlp->m_is_tsa = TRUE;
6272             dlp->m_use_biosrc = TRUE;
6273             if (dlp->m_is_virtual) {
6274               dlp->m_tsa_master = TRUE;
6275             }
6276             break;
6277           case MI_TECH_targeted :
6278             dlp->m_is_tls = TRUE;
6279             dlp->m_use_biosrc = TRUE;
6280             if (dlp->m_is_virtual) {
6281               dlp->m_tls_master = TRUE;
6282             }
6283             break;
6284           default :
6285             break;
6286       }
6287     }
6288   }
6289 
6290   /* process Unverified user object */
6291   for (sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_user, NULL);
6292        sdp != NULL;
6293        sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_user, sdp)) {
6294     if (sdp->choice != Seq_descr_user) continue;
6295     uop = (UserObjectPtr) sdp->data.ptrvalue;
6296     if (uop == NULL) continue;
6297     oip = uop->type;
6298     if (oip == NULL) continue;
6299     if (StringICmp (oip->str, "Unverified") == 0) {
6300       dlp->m_is_unverified = TRUE;
6301     } else if (StringICmp (oip->str, "AutodefOptions") == 0) {
6302       for (ufp = uop->data; ufp != NULL; ufp = ufp->next) {
6303         oip = ufp->label;
6304         if (oip == NULL) continue;
6305         if (StringICmp (oip->str, "Targeted Locus Name") != 0) continue;
6306         if (ufp->choice != 1) continue;
6307         str = (CharPtr) ufp->data.ptrvalue;
6308         if (StringHasNoText (str)) continue;
6309         dlp->m_targeted_locus = str;
6310       }
6311     }
6312   }
6313 
6314   /* process comments */
6315   for (sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_comment, NULL);
6316        sdp != NULL;
6317        sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_comment, sdp)) {
6318     if (sdp->choice != Seq_descr_comment) continue;
6319     str = (CharPtr) sdp->data.ptrvalue;
6320     if (str == NULL) continue;
6321     if (StringISearch (str, "[CAUTION] Could be the product of a pseudogene") != 0) {
6322       dlp->m_is_pseudogene = TRUE;
6323     }
6324   }
6325 
6326   /* process keywords */
6327   keywords = NULL;
6328 
6329   sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_genbank, NULL);
6330   if (sdp != NULL && sdp->choice == Seq_descr_genbank) {
6331     gbp = (GBBlockPtr) sdp->data.ptrvalue;
6332     if (gbp != NULL) {
6333       keywords = gbp->keywords;
6334     }
6335   }
6336   if (keywords == NULL) {
6337     sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_embl, NULL);
6338     if (sdp != NULL && sdp->choice == Seq_descr_embl) {
6339       ebp = (EMBLBlockPtr) sdp->data.ptrvalue;
6340       if (ebp != NULL) {
6341         keywords = ebp->keywords;
6342       }
6343     }
6344   }
6345   if (keywords != NULL) {
6346     for (vnp = keywords; vnp != NULL; vnp = vnp->next) {
6347       str = (CharPtr) vnp->data.ptrvalue;
6348       if (StringHasNoText (str)) continue;
6349       if (StringICmp (str, "UNORDERED") == 0) {
6350         dlp->m_unordered = TRUE;
6351       }
6352       if (! dlp->m_htg_tech && ! dlp->m_third_party) continue;
6353       if (StringICmp (str, "HTGS_DRAFT") == 0) {
6354         dlp->m_htgs_draft = TRUE;
6355       } else if (StringICmp (str, "HTGS_CANCELLED") == 0) {
6356         dlp->m_htgs_cancelled = TRUE;
6357       } else if (StringICmp (str, "HTGS_POOLED_MULTICLONE") == 0) {
6358         dlp->m_htgs_pooled = TRUE;
6359       } else if (StringICmp (str, "TPA:experimental") == 0) {
6360         dlp->m_tpa_exp = TRUE;
6361       } else if (StringICmp (str, "TPA:inferential") == 0) {
6362         dlp->m_tpa_inf = TRUE;
6363       } else if (StringICmp (str, "TPA:reassembly") == 0) {
6364         dlp->m_tpa_reasm = TRUE;
6365       } else if (StringICmp (str, "TPA:assembly") == 0) {
6366         dlp->m_tpa_reasm = TRUE;
6367       }
6368     }
6369   }
6370 
6371   if (dlp->m_is_pdb) {
6372 
6373     /* process PDB block */
6374     sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_pdb, NULL);
6375     if (sdp != NULL && sdp->choice == Seq_descr_pdb) {
6376       pbp = (PdbBlockPtr) sdp->data.ptrvalue;
6377       if (pbp != NULL) {
6378         compound = pbp->compound;
6379         if (compound != NULL) {
6380           dlp->m_pdb_compound = (CharPtr) compound->data.ptrvalue;
6381         }
6382       }
6383     }
6384   }
6385 
6386   if (dlp->m_is_wp) {
6387     for (sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_source, NULL);
6388          sdp != NULL;
6389          sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_source, sdp)) {
6390       if (sdp->choice != Seq_descr_source) continue;
6391       biop = (BioSourcePtr) sdp->data.ptrvalue;
6392       if (biop == NULL) continue;
6393       orp = biop->org;
6394       if (orp == NULL) continue;
6395       onp = orp->orgname;
6396       if (onp == NULL) continue;
6397       if (onp->choice != 5) continue;
6398       for (tep = (TaxElementPtr) onp->data; tep != NULL; tep = tep->next) {
6399         if (tep->fixed_level == 0 && StringICmp (tep->level, "superkingdom") == 0) {
6400           num_super_kingdom++;
6401           if (dlp->m_first_super_kingdom == NULL) {
6402             dlp->m_first_super_kingdom = tep->name;
6403           } else if (StringICmp (dlp->m_first_super_kingdom, tep->name) != 0) {
6404             dlp->m_second_super_kingdom = tep->name;
6405             super_kingdoms_different = TRUE;
6406           }
6407           if (num_super_kingdom > 1 && super_kingdoms_different) {
6408             dlp->m_is_cross_kingdom = TRUE;
6409           }
6410         }
6411       }
6412     }
6413   }
6414 
6415   if (dlp->m_is_map) {
6416     for (sfp = (SeqFeatPtr) bsp->seq_ext; sfp != NULL; sfp = sfp->next) {
6417       if (sfp->data.choice != SEQFEAT_RSITE) continue;
6418       rrp = (RsiteRefPtr) sfp->data.value.ptrvalue;
6419       if (rrp == NULL) continue;
6420       if (rrp->choice == 1) {
6421         dlp->m_enzyme = (CharPtr) rrp->data.ptrvalue;
6422       }
6423     }
6424   }
6425 }
6426 
6427 /* set instance variables from BioSource */
/*
 * x_SetSrcClone
 *
 * Per-feature callback (handed to VisitFeaturesOnBsp by x_SetBioSrc).
 * Sets m_has_clone on the defline context when a BioSource feature has a
 * clone SubSource whose name contains text.
 *
 *   sfp      - feature being visited; ignored unless it is SEQFEAT_BIOSRC
 *   userdata - the DefLinePtr context, passed as an untyped pointer
 */
static void x_SetSrcClone (
  SeqFeatPtr sfp,
  Pointer userdata
)

{
  DefLinePtr    context = (DefLinePtr) userdata;
  SubSourcePtr  sub;
  BioSourcePtr  source;

  if (sfp == NULL || context == NULL) return;
  if (sfp->data.choice != SEQFEAT_BIOSRC) return;

  source = (BioSourcePtr) sfp->data.value.ptrvalue;
  if (source == NULL) return;

  /* look for clones on source features */
  for (sub = source->subtype; sub != NULL; sub = sub->next) {
    if (sub->subtype == SUBSRC_clone && ! StringHasNoText (sub->name)) {
      context->m_has_clone = TRUE;
    }
  }
}
6452 
/*
 * x_SetBioSrc
 *
 * Fills the BioSource-derived members of the defline context from the
 * first source descriptor found for dlp->m_bioseq: taxname, genome,
 * plasmid/chromosome flags, selected SubSource and OrgMod values and the
 * multispecies flag. It then derives m_organelle via x_OrganelleName and,
 * if no clone has been seen yet, visits the Bioseq's features with
 * x_SetSrcClone to look for clones on source features.
 *
 *   dlp - defline context; nothing is done when it or its Bioseq is NULL.
 *
 * Stored strings are the pointers held by the BioSource, not copies.
 * For SubSource values the last non-empty entry of a kind wins; for
 * OrgMod values the first non-empty entry of a kind wins.
 */
static void x_SetBioSrc (
  DefLinePtr dlp
)

{
  BioSourcePtr   biop = NULL;
  BioseqPtr      bsp;
  Boolean        has_plasmid, virus_or_phage, wgs_suffix;
  OrgModPtr      omp;
  OrgNamePtr     onp;
  OrgRefPtr      orp;
  SeqDescrPtr    sdp;
  SubSourcePtr   ssp;
  TaxElementPtr  tep;

  if (dlp == NULL) return;

  bsp = dlp->m_bioseq;
  if (bsp == NULL) return;

  sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_source, NULL);
  if (sdp != NULL && sdp->choice == Seq_descr_source) {
    biop = (BioSourcePtr) sdp->data.ptrvalue;
  }

  if (biop != NULL) {
    orp = biop->org;
    if (orp != NULL && StringDoesHaveText (orp->taxname)) {
      dlp->m_taxname = orp->taxname;
    }

    dlp->m_genome = biop->genome;
    dlp->m_is_plasmid = (Boolean) (dlp->m_genome == GENOME_plasmid);
    dlp->m_is_chromosome = (Boolean) (dlp->m_genome == GENOME_chromosome);

    /* process SubSource */
    for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
      if (StringHasNoText (ssp->name)) continue;
      if (ssp->subtype == SUBSRC_chromosome) {
        dlp->m_chromosome = ssp->name;
      } else if (ssp->subtype == SUBSRC_clone) {
        dlp->m_clone = ssp->name;
        dlp->m_has_clone = TRUE;
      } else if (ssp->subtype == SUBSRC_map) {
        dlp->m_map = ssp->name;
      } else if (ssp->subtype == SUBSRC_plasmid_name) {
        dlp->m_plasmid = ssp->name;
      } else if (ssp->subtype == SUBSRC_segment) {
        dlp->m_segment = ssp->name;
      }
    }

    /* process OrgMod */
    onp = (orp != NULL) ? orp->orgname : NULL;
    if (onp != NULL) {
      /* choice 5 data is read as a TaxElement list */
      if (onp->choice == 5) {
        for (tep = (TaxElementPtr) onp->data; tep != NULL; tep = tep->next) {
          /* any fixed level, or any named level other than species */
          if (tep->fixed_level > 0 ||
              (StringDoesHaveText (tep->level) &&
               StringICmp (tep->level, "species") != 0)) {
            dlp->m_multispecies = TRUE;
          }
        }
      }
      for (omp = onp->mod; omp != NULL; omp = omp->next) {
        if (StringHasNoText (omp->subname)) continue;
        /* each slot is filled only while it is still empty */
        if (omp->subtype == ORGMOD_strain) {
          if (StringHasNoText (dlp->m_strain)) {
            dlp->m_strain = omp->subname;
          }
        } else if (omp->subtype == ORGMOD_cultivar) {
          if (StringHasNoText (dlp->m_cultivar)) {
            dlp->m_cultivar = omp->subname;
          }
        } else if (omp->subtype == ORGMOD_isolate) {
          if (StringHasNoText (dlp->m_isolate)) {
            dlp->m_isolate = omp->subname;
          }
        } else if (omp->subtype == ORGMOD_breed) {
          if (StringHasNoText (dlp->m_breed)) {
            dlp->m_breed = omp->subname;
          }
        }
      }
    }
  }

  /* inputs for the organelle word */
  virus_or_phage = (StringISearch (dlp->m_taxname, "virus") != NULL ||
                    StringISearch (dlp->m_taxname, "phage") != NULL) ? TRUE : FALSE;
  has_plasmid = StringDoesHaveText (dlp->m_plasmid) ? TRUE : FALSE;
  wgs_suffix = dlp->m_is_wgs ? TRUE : FALSE;

  dlp->m_organelle = x_OrganelleName (dlp, has_plasmid, virus_or_phage, wgs_suffix);

  /* only scan source features when no clone was found on the descriptor */
  if (! dlp->m_has_clone) {
    VisitFeaturesOnBsp (bsp, (Pointer) dlp, x_SetSrcClone);
  }
}
6577 
/*
 * x_TrimFirstNCharacters
 *
 * Removes up to count characters from the front of str, in place, by
 * shifting the remainder of the string to the start of the buffer.
 *
 *   str   - modifiable NUL-terminated string; NULL or empty is returned as is
 *   count - number of leading characters to drop; if the string is shorter
 *           the result is the empty string, and a count of zero or less
 *           leaves the text unchanged
 *
 * Returns str.
 */
static CharPtr x_TrimFirstNCharacters (
  CharPtr str,
  Int2 count
)

{
  CharPtr  from;
  CharPtr  to;

  if (str == NULL || *str == '\0') return str;

  /* step over the characters to discard, never past the terminator */
  for (from = str; count > 0 && *from != '\0'; count--) {
    from++;
  }

  /* slide what is left down to the beginning */
  for (to = str; *from != '\0'; from++, to++) {
    *to = *from;
  }
  *to = '\0';

  return str;
}
6607 
/*
 * x_TrimPunctuationFromEnd
 *
 * Truncates str, in place, at the start of its trailing run of space,
 * semicolon, comma, tilde and period characters. A string made up only of
 * those characters becomes empty. The buffer is written only when there is
 * something to trim.
 *
 *   str - modifiable NUL-terminated string; NULL or empty is returned as is
 *
 * Returns str.
 */
static CharPtr x_TrimPunctuationFromEnd (
  CharPtr str
)

{
  Uchar    last;    /* to use 8bit characters in multibyte languages */
  CharPtr  tail;

  if (str == NULL || *str == '\0') return str;

  /* locate the terminator */
  tail = str;
  while (*tail != '\0') {
    tail++;
  }

  /* back up over the trailing run of trimmable characters */
  while (tail > str) {
    last = *(tail - 1);
    if (last != ' ' && last != ';' && last != ',' && last != '~' && last != '.') break;
    tail--;
  }

  if (*tail != '\0') {
    *tail = '\0';
  }

  return str;
}
6638 
/*
 * x_TrimMostPunctFromEnd
 *
 * Truncates str, in place, at the start of its trailing run of space,
 * semicolon, comma and tilde characters. Unlike x_TrimPunctuationFromEnd,
 * a period is kept. A string made up only of the trimmed characters becomes
 * empty. The buffer is written only when there is something to trim.
 *
 *   str - modifiable NUL-terminated string; NULL or empty is returned as is
 *
 * Returns str.
 */
static CharPtr x_TrimMostPunctFromEnd (
  CharPtr str
)

{
  Uchar    last;    /* to use 8bit characters in multibyte languages */
  CharPtr  tail;

  if (str == NULL || *str == '\0') return str;

  /* locate the terminator */
  tail = str;
  while (*tail != '\0') {
    tail++;
  }

  /* back up over the trailing run of trimmable characters */
  while (tail > str) {
    last = *(tail - 1);
    if (last != ' ' && last != ';' && last != ',' && last != '~') break;
    tail--;
  }

  if (*tail != '\0') {
    *tail = '\0';
  }

  return str;
}
6669 
x_CatenateValNodeStrings(ValNodePtr list)6670 static CharPtr x_CatenateValNodeStrings (
6671   ValNodePtr list
6672 )
6673 
6674 {
6675   size_t      len;
6676   CharPtr     ptr;
6677   CharPtr     str;
6678   CharPtr     tmp;
6679   ValNodePtr  vnp;
6680 
6681 
6682   ptr = NULL;
6683   if (list != NULL) {
6684     vnp = list;
6685     len = 0;
6686     while (vnp != NULL) {
6687       if (vnp->data.ptrvalue != NULL) {
6688         len += StringLen ((CharPtr) vnp->data.ptrvalue) + 1;
6689       }
6690       vnp = vnp->next;
6691     }
6692     if (len > 0) {
6693       ptr = MemNew (sizeof (Char) * (len + 2));
6694       if (ptr != NULL) {
6695         vnp = list;
6696         tmp = ptr;
6697         while (vnp != NULL) {
6698           str = (CharPtr) vnp->data.ptrvalue;
6699           /* do not use StringDoesHaveText because generalID must be prefixed by space */
6700           if (str != NULL) {
6701             tmp = StringMove (tmp, str);
6702           }
6703           vnp = vnp->next;
6704         }
6705       }
6706     }
6707   }
6708   return ptr;
6709 }
6710 
x_DescribeClones(DefLinePtr dlp)6711 static CharPtr x_DescribeClones (
6712   DefLinePtr dlp
6713 )
6714 
6715 {
6716   Char     buf [128];
6717   Char     ch;
6718   Int4     count;
6719   size_t   len;
6720   CharPtr  result = NULL;
6721   CharPtr  str;
6722 
6723   if (dlp == NULL) return NULL;
6724 
6725   if (dlp->m_htgs_unfinished && dlp->m_htgs_pooled && dlp->m_has_clone) {
6726     result = StringSave (", pooled multiple clones");
6727     return result;
6728   }
6729 
6730   str = dlp->m_clone;
6731   if (StringHasNoText (str)) return NULL;
6732 
6733   count = 1;
6734   ch = *str;
6735   while (ch != '\0') {
6736     if (ch == ';') {
6737       count++;
6738     }
6739     str++;
6740     ch = *str;
6741   }
6742 
6743   if (count > 3) {
6744     sprintf (buf, ", %d clones", (int) count);
6745     result = StringSave (buf);
6746   } else {
6747     len = StringLen (dlp->m_clone) + 20;
6748     result = (CharPtr) MemNew (sizeof (Char) * len);
6749     if (result != NULL) {
6750       StringCat (result, " clone ");
6751       StringCat (result, dlp->m_clone);
6752     }
6753   }
6754 
6755   return result;
6756 }
6757 
x_EndsWithStrain(DefLinePtr dlp,CharPtr strain)6758 static Boolean x_EndsWithStrain (
6759   DefLinePtr dlp,
6760   CharPtr strain
6761 )
6762 
6763 {
6764   Char     ch;
6765   size_t   len;
6766   CharPtr  nxt;
6767   CharPtr  ptr;
6768 
6769   if (dlp == NULL || strain == NULL) return FALSE;
6770 
6771   len = StringLen (strain);
6772   if (len >= StringLen (dlp->m_taxname)) return FALSE;
6773 
6774   ptr = StringChr (dlp->m_taxname, ' ');
6775   if (ptr == NULL) return FALSE;
6776   ptr++;
6777   ptr = StringChr (ptr, ' ');
6778   if (ptr == NULL) return FALSE;
6779   ptr++;
6780 
6781   ptr = StringISearch (dlp->m_taxname, strain);
6782   if (ptr == NULL) return FALSE;
6783 
6784   nxt = StringISearch (ptr + 1, strain);
6785   while (nxt != NULL) {
6786     ptr = nxt;
6787     nxt = StringISearch (ptr + 1, strain);
6788   }
6789 
6790   ptr += len;
6791   if (! StringHasNoText (ptr)) {
6792     if (StringCmp (ptr, "'") == 0) {
6793       ptr -= len + 1;
6794       if (*ptr == '\'') return TRUE;
6795     }
6796     return FALSE;
6797   }
6798   ptr -= len + 1;
6799   ch = *ptr;
6800   /*
6801   if (ch == ' ' || ch == '-' || ch == '_' || ch == ':' ||
6802       ch == ';' || ch == '.' || ch == '/') {
6803      return TRUE;
6804   }
6805   */
6806   if (ispunct (ch) || isspace (ch)) {
6807     return TRUE;
6808   }
6809 
6810   return FALSE;
6811 }
6812 
x_TitleFromBioSrc(DefLinePtr dlp)6813 static CharPtr x_TitleFromBioSrc (
6814   DefLinePtr dlp
6815 )
6816 
6817 {
6818   CharPtr     result = NULL, cln, stn, ptr;
6819   ValNodePtr  strings = NULL;
6820 
6821   if (dlp == NULL) return NULL;
6822 
6823   ValNodeCopyStr (&strings, 0, dlp->m_taxname);
6824 
6825   if (StringDoesHaveText (dlp->m_strain)) {
6826     stn = StringSave (dlp->m_strain);
6827     ptr = StringChr (stn, ';');
6828     if (ptr != NULL) {
6829       *ptr = '\0';
6830     }
6831     if (! x_EndsWithStrain (dlp, stn)) {
6832       ValNodeCopyStr (&strings, 0, " strain ");
6833       ValNodeCopyStr (&strings, 0, stn);
6834     }
6835     MemFree (stn);
6836   }
6837 
6838   if (StringDoesHaveText (dlp->m_breed)) {
6839     ValNodeCopyStr (&strings, 0, " breed ");
6840     ValNodeCopyStr (&strings, 0, dlp->m_breed);
6841   }
6842 
6843   if (StringDoesHaveText (dlp->m_cultivar)) {
6844     ValNodeCopyStr (&strings, 0, " cultivar ");
6845     ValNodeCopyStr (&strings, 0, dlp->m_cultivar);
6846   }
6847 
6848   if (StringDoesHaveText (dlp->m_isolate)) {
6849     /* x_EndsWithStrain just checks for supplied pattern, using here for isolate */
6850     if (! x_EndsWithStrain (dlp, dlp->m_isolate)) {
6851       ValNodeCopyStr (&strings, 0, " isolate ");
6852       ValNodeCopyStr (&strings, 0, dlp->m_isolate);
6853     }
6854   }
6855 
6856   if (StringDoesHaveText (dlp->m_chromosome)) {
6857     ValNodeCopyStr (&strings, 0, " chromosome ");
6858     ValNodeCopyStr (&strings, 0, dlp->m_chromosome);
6859   }
6860 
6861   cln = x_DescribeClones (dlp);
6862   if (StringDoesHaveText (cln)) {
6863     ValNodeCopyStr (&strings, 0, cln);
6864   }
6865   MemFree (cln);
6866 
6867   if (StringDoesHaveText (dlp->m_map)) {
6868     ValNodeCopyStr (&strings, 0, " map ");
6869     ValNodeCopyStr (&strings, 0, dlp->m_map);
6870   }
6871 
6872   if (StringDoesHaveText (dlp->m_organelle)) {
6873     if (StringCmp (dlp->m_organelle, "chromosome") == 0) {
6874       /*
6875       if (StringHasNoText (dlp->m_chromosome)) {
6876         ValNodeCopyStr (&strings, 0, " ");
6877         ValNodeCopyStr (&strings, 0, dlp->m_organelle);
6878       }
6879       */
6880     } else if (StringCmp (dlp->m_organelle, "plasmid") == 0) {
6881       if (StringHasNoText (dlp->m_plasmid) && StringHasNoText (dlp->m_chromosome)) {
6882         ValNodeCopyStr (&strings, 0, " ");
6883         ValNodeCopyStr (&strings, 0, dlp->m_organelle);
6884       }
6885     } else {
6886      ValNodeCopyStr (&strings, 0, " ");
6887      ValNodeCopyStr (&strings, 0, dlp->m_organelle);
6888     }
6889   }
6890 
6891   if (StringDoesHaveText (dlp->m_plasmid)) {
6892     if (StringStr (dlp->m_plasmid, "plasmid") == NULL) {
6893       ValNodeCopyStr (&strings, 0, " plasmid ");
6894     } else {
6895       ValNodeCopyStr (&strings, 0, " ");
6896     }
6897     ValNodeCopyStr (&strings, 0, dlp->m_plasmid);
6898   }
6899 
6900   result = x_CatenateValNodeStrings (strings);
6901   ValNodeFreeData (strings);
6902   if (result == NULL) return NULL;
6903 
6904   return result;
6905 }
6906 
/*
 * x_LowercasePlasmidOrElement
 *
 * Rewrites def in place so that every case-insensitive occurrence of
 * "plasmid" that begins with 'P', and of "element" that begins with 'E',
 * starts with the lowercase letter instead. The first character of def is
 * skipped, so an occurrence at the very start of the string is left alone.
 * Only the leading letter of each occurrence is changed.
 *
 *   def - modifiable NUL-terminated string; ignored when it has no text
 */
static void x_LowercasePlasmidOrElement (
  CharPtr def
)

{
  CharPtr  body;
  CharPtr  hit;

  if (StringHasNoText (def)) return;

  /* searching starts at the second character */
  body = def + 1;

  /* 7 is the length of both search words; resume just past each hit */
  for (hit = StringISearch (body, "plasmid");
       hit != NULL;
       hit = StringISearch (hit + 7, "plasmid")) {
    if (*hit == 'P') {
      *hit = 'p';
    }
  }

  for (hit = StringISearch (body, "element");
       hit != NULL;
       hit = StringISearch (hit + 7, "element")) {
    if (*hit == 'E') {
      *hit = 'e';
    }
  }
}
6934 
x_TitleFromNC(DefLinePtr dlp)6935 static CharPtr x_TitleFromNC (
6936   DefLinePtr dlp
6937 )
6938 
6939 {
6940   CharPtr     completeseq = ", complete sequence";
6941   CharPtr     completegen = ", complete genome";
6942   CharPtr     result = NULL, pls_pfx = "";
6943   ValNodePtr  strings = NULL;
6944 
6945   if (dlp == NULL) return NULL;
6946 
6947   if (dlp->m_mi_biomol != MOLECULE_TYPE_GENOMIC &&
6948       dlp->m_mi_biomol != MOLECULE_TYPE_OTHER_GENETIC_MATERIAL) return NULL;
6949 
6950   if (StringHasNoText (dlp->m_taxname)) return NULL;
6951 
6952   if (dlp->m_mi_completeness == 2 ||
6953       dlp->m_mi_completeness == 3 ||
6954       dlp->m_mi_completeness == 4 ||
6955       dlp->m_mi_completeness == 5) {
6956     /* remove "complete" component */
6957     completeseq = ", partial sequence";
6958     completegen = ", genome";
6959   }
6960 
6961   if (StringDoesHaveText (dlp->m_plasmid)) {
6962     if (StringISearch (dlp->m_plasmid, "plasmid") == NULL &&
6963         StringISearch (dlp->m_plasmid, "element") == NULL) {
6964       pls_pfx = "plasmid ";
6965     }
6966   }
6967 
6968   if (StringISearch (dlp->m_taxname, "plasmid") != NULL) {
6969 
6970     ValNodeCopyStr (&strings, 0, dlp->m_taxname);
6971     ValNodeCopyStr (&strings, 0, completeseq);
6972 
6973   } else if (dlp->m_is_plasmid) {
6974 
6975     if (StringDoesHaveText (dlp->m_plasmid)) {
6976       ValNodeCopyStr (&strings, 0, dlp->m_taxname);
6977       ValNodeCopyStr (&strings, 0, " ");
6978       ValNodeCopyStr (&strings, 0, pls_pfx);
6979       ValNodeCopyStr (&strings, 0, dlp->m_plasmid);
6980       ValNodeCopyStr (&strings, 0, completeseq);
6981     } else {
6982       ValNodeCopyStr (&strings, 0, dlp->m_taxname);
6983       ValNodeCopyStr (&strings, 0, " unnamed plasmid");
6984       ValNodeCopyStr (&strings, 0, completeseq);
6985     }
6986 
6987   } else if (StringDoesHaveText (dlp->m_plasmid)) {
6988 
6989     if (StringDoesHaveText (dlp->m_organelle)) {
6990       ValNodeCopyStr (&strings, 0, dlp->m_taxname);
6991       ValNodeCopyStr (&strings, 0, " ");
6992       ValNodeCopyStr (&strings, 0, dlp->m_organelle);
6993       ValNodeCopyStr (&strings, 0, " ");
6994       ValNodeCopyStr (&strings, 0, pls_pfx);
6995       ValNodeCopyStr (&strings, 0, dlp->m_plasmid);
6996       ValNodeCopyStr (&strings, 0, completeseq);
6997     } else {
6998       ValNodeCopyStr (&strings, 0, dlp->m_taxname);
6999       ValNodeCopyStr (&strings, 0, " ");
7000       ValNodeCopyStr (&strings, 0, pls_pfx);
7001       ValNodeCopyStr (&strings, 0, dlp->m_plasmid);
7002       ValNodeCopyStr (&strings, 0, completeseq);
7003     }
7004 
7005   } else if (StringDoesHaveText (dlp->m_organelle)) {
7006 
7007     if (StringDoesHaveText (dlp->m_chromosome)) {
7008       ValNodeCopyStr (&strings, 0, dlp->m_taxname);
7009       if (! dlp->m_is_chromosome) {
7010         ValNodeCopyStr (&strings, 0, " ");
7011         ValNodeCopyStr (&strings, 0, dlp->m_organelle);
7012       }
7013       ValNodeCopyStr (&strings, 0, " chromosome ");
7014       ValNodeCopyStr (&strings, 0, dlp->m_chromosome);
7015       ValNodeCopyStr (&strings, 0, completeseq);
7016     } else {
7017       ValNodeCopyStr (&strings, 0, dlp->m_taxname);
7018       switch (dlp->m_genome) {
7019         case GENOME_mitochondrion :
7020         case GENOME_chloroplast :
7021         case GENOME_kinetoplast :
7022         case GENOME_plastid :
7023         case GENOME_apicoplast :
7024           ValNodeCopyStr (&strings, 0, " ");
7025           ValNodeCopyStr (&strings, 0, dlp->m_organelle);
7026           break;
7027       }
7028       ValNodeCopyStr (&strings, 0, completegen);
7029     }
7030 
7031   } else if (StringDoesHaveText (dlp->m_segment)) {
7032 
7033     if (StringStr (dlp->m_segment, "DNA") == NULL &&
7034         StringStr (dlp->m_segment, "RNA") == NULL &&
7035         StringStr (dlp->m_segment, "segment") == NULL &&
7036         StringStr (dlp->m_segment, "Segment") == NULL) {
7037       ValNodeCopyStr (&strings, 0, dlp->m_taxname);
7038       ValNodeCopyStr (&strings, 0, " segment ");
7039       ValNodeCopyStr (&strings, 0, dlp->m_segment);
7040       ValNodeCopyStr (&strings, 0, completegen);
7041     } else {
7042       ValNodeCopyStr (&strings, 0, dlp->m_taxname);
7043       ValNodeCopyStr (&strings, 0, " ");
7044       ValNodeCopyStr (&strings, 0, dlp->m_segment);
7045       ValNodeCopyStr (&strings, 0, completegen);
7046     }
7047 
7048   } else if (StringDoesHaveText (dlp->m_chromosome)) {
7049 
7050     ValNodeCopyStr (&strings, 0, dlp->m_taxname);
7051     ValNodeCopyStr (&strings, 0, " chromosome ");
7052     ValNodeCopyStr (&strings, 0, dlp->m_chromosome);
7053     ValNodeCopyStr (&strings, 0, completegen);
7054 
7055   } else {
7056 
7057     ValNodeCopyStr (&strings, 0, dlp->m_taxname);
7058     ValNodeCopyStr (&strings, 0, completegen);
7059   }
7060 
7061   result = x_CatenateValNodeStrings (strings);
7062   ValNodeFreeData (strings);
7063   if (result == NULL) return NULL;
7064 
7065   x_LowercasePlasmidOrElement (result);
7066 
7067   return result;
7068 }
7069 
/* Tally filled in by the x_FindNMFeats feature-visitor callback.
   The feature pointers are overwritten on every matching feature,
   so after a traversal they refer to the last one visited. */
typedef struct nmfeatdata {
  SeqFeatPtr  gene;      /* most recently visited gene feature */
  SeqFeatPtr  cds;       /* most recently visited coding region feature */
  Int2        numgenes;  /* how many gene features were visited */
  Int2        numcds;    /* how many coding region features were visited */
  Int2        numprots;  /* how many protein features were visited */
} NmFeatData, PNTR NmFeatPtr;
7077 
/* Feature-visitor callback.  userdata must point to an NmFeatData;
   gene, coding region and protein features are counted in it, and the
   gene / cds slots are updated to the feature just seen.  Any other
   feature type is ignored, as are NULL arguments. */
static void x_FindNMFeats (
  SeqFeatPtr sfp,
  Pointer userdata
)

{
  NmFeatPtr  tally = (NmFeatPtr) userdata;

  if (sfp == NULL || tally == NULL) return;

  if (sfp->data.choice == SEQFEAT_GENE) {
    tally->gene = sfp;
    tally->numgenes++;
  } else if (sfp->data.choice == SEQFEAT_CDREGION) {
    tally->cds = sfp;
    tally->numcds++;
  } else if (sfp->data.choice == SEQFEAT_PROT) {
    tally->numprots++;
  }
}
7106 
x_IsFlyCG(CharPtr str)7107 static Boolean x_IsFlyCG (
7108   CharPtr str
7109 )
7110 {
7111   Char  ch;
7112 
7113   if (StringHasNoText (str)) return FALSE;
7114 
7115   ch = *str;
7116   if (ch != 'C') return FALSE;
7117 
7118   str++;
7119   ch = *str;
7120   if (ch != 'G') return FALSE;
7121 
7122   str++;
7123   ch = *str;
7124   while (IS_DIGIT (ch)) {
7125     str++;
7126     ch = *str;
7127   }
7128   if (ch != '-') return FALSE;
7129 
7130   str++;
7131   ch = *str;
7132   if (ch != 'P') return FALSE;
7133 
7134   str++;
7135   ch = *str;
7136   if (IS_ALPHA (ch)) {
7137     str++;
7138     ch = *str;
7139     if (ch == '\0' || ch == ' ' || ch == ',' || ch == ';') return TRUE;
7140   }
7141 
7142   return FALSE;
7143 }
7144 
/* Scans str one whitespace-delimited token at a time.  In the first
   token accepted by x_IsFlyCG, the 'P' of its "-P" is overwritten in
   place with 'R', and the function returns.  The string is left
   untouched when no token qualifies. */
static void x_FlyCG_PtoR (
  CharPtr str
)

{
  CharPtr  marker;

  while (StringDoesHaveText (str)) {
    /* advance to the start of the next token */
    while (IS_WHITESP (*str)) {
      str++;
    }

    if (x_IsFlyCG (str)) {
      marker = StringStr (str, "-P");
      if (marker != NULL) {
        marker [1] = 'R';
        return;
      }
    }

    /* step over the token that was just examined */
    while (*str != '\0' && (! IS_WHITESP (*str))) {
      str++;
    }
  }
}
7172 
x_TitleFromNM(DefLinePtr dlp)7173 static CharPtr x_TitleFromNM (
7174   DefLinePtr dlp
7175 )
7176 
7177 {
7178   Char         buf [512], buf2 [600];
7179   CharPtr      cds = NULL, gene = NULL, ptr, result = NULL;
7180   Uint2        entityID;
7181   size_t       len;
7182   NmFeatData   nfd;
7183   SeqEntryPtr  sep;
7184 
7185   if (dlp == NULL) return NULL;
7186 
7187   if (StringHasNoText (dlp->m_taxname)) return NULL;
7188 
7189   MemSet ((Pointer) &nfd, 0, sizeof (NmFeatData));
7190 
7191   entityID = ObjMgrGetEntityIDForPointer (dlp->m_bioseq);
7192   sep = GetBestTopParentForDataEx (entityID, dlp->m_bioseq, TRUE);
7193 
7194   VisitFeaturesInSep (sep, (Pointer) &nfd, x_FindNMFeats);
7195   if (nfd.numgenes != 1 || nfd.numcds != 1 || nfd.numprots < 1) return NULL;
7196 
7197   FeatDefLabel (nfd.gene, buf, sizeof (buf) - 1, OM_LABEL_CONTENT);
7198   gene = StringSaveNoNull (buf);
7199 
7200   FeatDefLabel (nfd.cds, buf, sizeof (buf) - 1, OM_LABEL_CONTENT);
7201 
7202   /* special case Drosophila RefSeq NM titles */
7203   if (StringICmp (dlp->m_taxname, "Drosophila melanogaster") == 0) {
7204     x_FlyCG_PtoR (buf);
7205   }
7206   ptr = StringStr (buf, "isoform ");
7207   if (ptr != NULL) {
7208     *ptr = '\0';
7209     ptr += 8;
7210     StringCpy (buf2, buf);
7211     StringCat (buf2, "transcript variant ");
7212     StringCat (buf2, ptr);
7213     cds = StringSaveNoNull (buf2);
7214   } else {
7215     cds = StringSaveNoNull (buf);
7216   }
7217 
7218   len = StringLen (dlp->m_taxname) + StringLen (cds) +
7219         StringLen (gene) + StringLen ("  (), mRNA") + 10;
7220 
7221   result = (CharPtr) MemNew (sizeof (Char) * len);
7222 
7223   if (result != NULL) {
7224     sprintf (result, "%s %s (%s), mRNA", dlp->m_taxname, cds, gene);
7225   }
7226 
7227   MemFree (gene);
7228   MemFree (cds);
7229 
7230   return result;
7231 }
7232 
x_TitleFromNR(DefLinePtr dlp)7233 static CharPtr x_TitleFromNR (
7234   DefLinePtr dlp
7235 )
7236 
7237 {
7238   Char         buf [512];
7239   Uint2        entityID;
7240   CharPtr      gene = NULL,  rna = "miscRNA", result = NULL;
7241   size_t       len;
7242   NmFeatData   nfd;
7243   SeqEntryPtr  sep;
7244 
7245   if (dlp == NULL) return NULL;
7246 
7247   if (StringHasNoText (dlp->m_taxname)) return NULL;
7248 
7249   MemSet ((Pointer) &nfd, 0, sizeof (NmFeatData));
7250 
7251   entityID = ObjMgrGetEntityIDForPointer (dlp->m_bioseq);
7252   sep = GetBestTopParentForDataEx (entityID, dlp->m_bioseq, TRUE);
7253 
7254   VisitFeaturesInSep (sep, (Pointer) &nfd, x_FindNMFeats);
7255   if (nfd.numgenes < 1) return NULL;
7256 
7257   FeatDefLabel (nfd.gene, buf, sizeof (buf) - 1, OM_LABEL_CONTENT);
7258   gene = StringSaveNoNull (buf);
7259 
7260   switch (dlp->m_mi_biomol) {
7261       case MOLECULE_TYPE_PRE_MRNA :
7262         rna = "precursorRNA";
7263         break;
7264       case MOLECULE_TYPE_MRNA :
7265         rna = "mRNA";
7266         break;
7267       case MOLECULE_TYPE_RRNA :
7268         rna = "rRNA";
7269         break;
7270       case MOLECULE_TYPE_TRNA :
7271          rna = "tRNA";
7272         break;
7273       case MOLECULE_TYPE_SNRNA :
7274         rna = "snRNA";
7275         break;
7276       case MOLECULE_TYPE_SCRNA :
7277         rna = "scRNA";
7278         break;
7279       case MOLECULE_TYPE_CRNA :
7280         rna = "cRNA";
7281         break;
7282       case MOLECULE_TYPE_SNORNA :
7283         rna = "snoRNA";
7284         break;
7285       case MOLECULE_TYPE_TRANSCRIBED_RNA :
7286         rna = "miscRNA";
7287         break;
7288       case MOLECULE_TYPE_NCRNA :
7289         rna = "ncRNA";
7290         break;
7291       case MOLECULE_TYPE_TMRNA :
7292         rna = "tmRNA";
7293         break;
7294       default :
7295         break;
7296   }
7297 
7298   len = StringLen (dlp->m_taxname) + StringLen (gene) +
7299         StringLen (", ") + 30;
7300 
7301   result = (CharPtr) MemNew (sizeof (Char) * len);
7302   if (result != NULL) {
7303     sprintf (result, "%s %s, %s", dlp->m_taxname, gene, rna);
7304   }
7305 
7306   MemFree (gene);
7307 
7308   return result;
7309 }
7310 
x_TitleFromPatent(DefLinePtr dlp)7311 static CharPtr x_TitleFromPatent (
7312   DefLinePtr dlp
7313 )
7314 
7315 {
7316   Char  buf [128];
7317 
7318   if (dlp == NULL) return NULL;
7319 
7320   sprintf (buf, "Sequence %d from Patent %s %s",
7321            (int) dlp->m_patent_sequence,
7322            dlp->m_patent_country,
7323            dlp->m_patent_number);
7324 
7325   return StringSave (buf);
7326 }
7327 
x_TitleFromPDB(DefLinePtr dlp)7328 static CharPtr x_TitleFromPDB (
7329   DefLinePtr dlp
7330 )
7331 
7332 {
7333   Char        buf [128];
7334   Char        ch;
7335   CharPtr     result = NULL;
7336   ValNodePtr  strings = NULL;
7337 
7338   if (dlp == NULL) return NULL;
7339 
7340   ch = dlp->m_pdb_chain;
7341   if (IS_PRINT (ch)) {
7342     sprintf (buf, "Chain %c, ", ch);
7343     ValNodeCopyStr (&strings, 0, buf);
7344   }
7345   ValNodeCopyStr (&strings, 0, dlp->m_pdb_compound);
7346 
7347   result = x_CatenateValNodeStrings (strings);
7348   ValNodeFreeData (strings);
7349 
7350   return result;
7351 }
7352 
x_TitleFromGPipe(DefLinePtr dlp)7353 static CharPtr x_TitleFromGPipe (
7354   DefLinePtr dlp
7355 )
7356 
7357 {
7358   CharPtr     result = NULL, cln, stn, ptr;
7359   ValNodePtr  strings = NULL;
7360 
7361   if (dlp == NULL) return NULL;
7362 
7363   ValNodeCopyStr (&strings, 0, dlp->m_taxname);
7364 
7365   if (StringDoesHaveText (dlp->m_organelle) && StringICmp (dlp->m_organelle, "plasmid") != 0) {
7366     ValNodeCopyStr (&strings, 0, " ");
7367     ValNodeCopyStr (&strings, 0, dlp->m_organelle);
7368   }
7369 
7370   if (StringDoesHaveText (dlp->m_strain)) {
7371     stn = StringSave (dlp->m_strain);
7372     ptr = StringChr (stn, ';');
7373     if (ptr != NULL) {
7374       *ptr = '\0';
7375     }
7376     if (! x_EndsWithStrain (dlp, stn)) {
7377       ValNodeCopyStr (&strings, 0, " strain ");
7378       ValNodeCopyStr (&strings, 0, stn);
7379     }
7380     MemFree (stn);
7381   }
7382 
7383   if (StringDoesHaveText (dlp->m_chromosome)) {
7384     ValNodeCopyStr (&strings, 0, " chromosome ");
7385     ValNodeCopyStr (&strings, 0, dlp->m_chromosome);
7386   }
7387 
7388   cln = x_DescribeClones (dlp);
7389   if (StringDoesHaveText (cln)) {
7390     ValNodeCopyStr (&strings, 0, cln);
7391   }
7392   MemFree (cln);
7393 
7394   if (StringDoesHaveText (dlp->m_map)) {
7395     ValNodeCopyStr (&strings, 0, " map ");
7396     ValNodeCopyStr (&strings, 0, dlp->m_map);
7397   }
7398 
7399   if (StringDoesHaveText (dlp->m_plasmid)) {
7400     ValNodeCopyStr (&strings, 0, " plasmid ");
7401     ValNodeCopyStr (&strings, 0, dlp->m_plasmid);
7402   }
7403 
7404   if (dlp->m_mi_completeness == 1) {
7405     ValNodeCopyStr (&strings, 0, ", complete sequence");
7406   }
7407 
7408   result = x_CatenateValNodeStrings (strings);
7409   ValNodeFreeData (strings);
7410   if (result == NULL) return NULL;
7411 
7412   return result;
7413 }
7414 
/* Search state shared between x_GetLongestProteinUnindexed and its
   x_GetLongestProtFeat visitor callback. */
typedef struct udxfeatdata {
  SeqIdPtr    bspid;      /* ids of the Bioseq; features located elsewhere are skipped */
  Int4        longest;    /* SeqLocLen of the best feature found so far */
  Uint1       processed;  /* ProtRef processed value of that feature */
  SeqFeatPtr  sfp;        /* the best feature found so far, NULL if none */
} UdxFeatData, PNTR UdxFeatPtr;
7421 
/* Feature-visitor callback.  userdata must point to a UdxFeatData.
   A protein feature located on one of the ids in bspid replaces the
   current best candidate when it is strictly longer, or when it is
   equally long and has a smaller ProtRef processed value. */
static void x_GetLongestProtFeat (
  SeqFeatPtr sfp,
  Pointer userdata
)

{
  UdxFeatPtr  best;
  Boolean     better;
  Int4        featlen;
  SeqIdPtr    locid;
  ProtRefPtr  prot;

  if (sfp == NULL) return;
  if (sfp->data.choice != SEQFEAT_PROT) return;

  prot = (ProtRefPtr) sfp->data.value.ptrvalue;
  best = (UdxFeatPtr) userdata;
  if (prot == NULL || best == NULL) return;

  /* only features placed on the Bioseq of interest take part */
  locid = SeqLocId (sfp->location);
  if (locid == NULL) return;
  if (! SeqIdIn (locid, best->bspid)) return;

  featlen = SeqLocLen (sfp->location);
  if (featlen == -1) return;

  /* on a length tie: unprocessed 0 preferred over preprotein 1
     preferred over mat peptide 2 */
  better = (Boolean) (featlen > best->longest ||
                      (featlen == best->longest &&
                       prot->processed < best->processed));
  if (! better) return;

  best->sfp = sfp;
  best->longest = featlen;
  best->processed = prot->processed;
}
7460 
x_GetLongestProteinUnindexed(BioseqPtr bsp)7461 static SeqFeatPtr x_GetLongestProteinUnindexed (
7462   BioseqPtr bsp
7463 )
7464 
7465 {
7466   BioseqSetPtr  bssp = NULL;
7467   UdxFeatData   ufd;
7468 
7469   if (bsp == NULL) return NULL;
7470 
7471   MemSet ((Pointer) &ufd, 0, sizeof (UdxFeatData));
7472   ufd.bspid = bsp->id;
7473   ufd.longest = 0;
7474   ufd.sfp = NULL;
7475 
7476   VisitFeaturesOnBsp (bsp, (Pointer) &ufd, x_GetLongestProtFeat);
7477 
7478   if (ufd.sfp != NULL && ufd.longest == bsp->length) return ufd.sfp;
7479 
7480   if (bsp->idx.parenttype == OBJ_BIOSEQSET) {
7481     bssp = (BioseqSetPtr) bsp->idx.parentptr;
7482   }
7483 
7484   if (bssp != NULL && bssp->_class == BioseqseqSet_class_parts) {
7485     VisitFeaturesOnSet (bssp, (Pointer) &ufd, x_GetLongestProtFeat);
7486 
7487     if (bssp->idx.parenttype == OBJ_BIOSEQSET) {
7488       bssp = (BioseqSetPtr) bssp->idx.parentptr;
7489     }
7490   }
7491 
7492   if (bssp != NULL && bssp->_class == BioseqseqSet_class_segset) {
7493     VisitFeaturesOnSet (bssp, (Pointer) &ufd, x_GetLongestProtFeat);
7494   }
7495 
7496   return ufd.sfp;
7497 }
7498 
x_NotSpecialTaxName(CharPtr taxname)7499 static Boolean x_NotSpecialTaxName (
7500   CharPtr taxname
7501 )
7502 
7503 {
7504   if (StringHasNoText (taxname)) return TRUE;
7505 
7506   if (StringICmp (taxname, "synthetic construct") == 0) return FALSE;
7507   if (StringICmp (taxname, "artificial sequence") == 0) return FALSE;
7508   if (StringStr (taxname, "vector") != NULL) return FALSE;
7509   if (StringStr (taxname, "Vector") != NULL) return FALSE;
7510 
7511   return TRUE;
7512 }
7513 
7514 /*
7515 static CharPtr proteinOrganellePrefix [] = {
7516   NULL,
7517   NULL,
7518   "chloroplast",
7519   "chromoplast",
7520   "kinetoplast",
7521   "mitochondrion",
7522   "plastid",
7523   "macronuclear",
7524   "extrachromosomal",
7525   "plasmid",
7526   NULL,
7527   NULL,
7528   "cyanelle",
7529   "proviral",
7530   "virus",
7531   "nucleomorph",
7532   "apicoplast",
7533   "leucoplast",
7534   "protoplast",
7535   "endogenous virus",
7536   "hydrogenosome",
7537   "chromosome",
7538   "chromatophore"
7539 };
7540 */
7541 
/* Organelle text that x_TitleFromProtein appends in parentheses to a
   protein title.  It is indexed by dlp->m_genome, and only consulted
   for values from GENOME_chloroplast through GENOME_chromatophore
   (presumably slots 2 through 22 - confirm against the GENOME_*
   definitions).  A NULL slot adds nothing to the title.  The slots
   marked below hold text in the commented-out copy of this table but
   are NULL here. */
static CharPtr proteinOrganellePrefix [] = {
  NULL,                /*  0 */
  NULL,                /*  1 */
  "chloroplast",       /*  2 */
  "chromoplast",       /*  3 */
  "kinetoplast",       /*  4 */
  "mitochondrion",     /*  5 */
  "plastid",           /*  6 */
  "macronuclear",      /*  7 */
  NULL,                /*  8 - "extrachromosomal" in the commented-out copy */
  "plasmid",           /*  9 */
  NULL,                /* 10 */
  NULL,                /* 11 */
  "cyanelle",          /* 12 */
  NULL,                /* 13 - "proviral" in the commented-out copy */
  NULL,                /* 14 - "virus" in the commented-out copy */
  "nucleomorph",       /* 15 */
  "apicoplast",        /* 16 */
  "leucoplast",        /* 17 */
  "protoplast",        /* 18 */
  "endogenous virus",  /* 19 */
  "hydrogenosome",     /* 20 */
  NULL,                /* 21 - "chromosome" in the commented-out copy */
  "chromatophore"      /* 22 */
};
7567 
x_TitleFromProtein(DefLinePtr dlp)7568 static CharPtr x_TitleFromProtein (
7569   DefLinePtr dlp
7570 )
7571 
7572 {
7573   BioSourcePtr       biop;
7574   BioseqPtr          bsp;
7575   SeqFeatPtr         cds = NULL;
7576   Char               ch;
7577   CharPtr            comma = NULL;
7578   Uint2              entityID;
7579   SeqMgrFeatContext  fcontext;
7580   GeneRefPtr         grp;
7581   Boolean            indexed;
7582   CharPtr            isoform = NULL;
7583   size_t             len;
7584   CharPtr            low_qual = "LOW QUALITY PROTEIN: ";
7585   Int2               offset = 0;
7586   CharPtr            organelle = NULL;
7587   OrgRefPtr          orp;
7588   Boolean            partial = FALSE;
7589   CharPtr            prefix = "";
7590   ProtRefPtr         prp;
7591   CharPtr            ptr;
7592   CharPtr            result = NULL;
7593   SeqFeatPtr         sfp = NULL;
7594   SeqIntPtr          sintp;
7595   SeqLocPtr          slp, slpx;
7596   SeqPntPtr          spp;
7597   CharPtr            str;
7598   ValNodePtr         strings = NULL;
7599   CharPtr            taxname = NULL;
7600   CharPtr            title = NULL;
7601   CharPtr            tmp;
7602   ValNodePtr         vnp;
7603 
7604   if (dlp == NULL) return NULL;
7605 
7606   bsp = dlp->m_bioseq;
7607   if (bsp == NULL) return NULL;
7608 
7609   entityID = ObjMgrGetEntityIDForPointer (bsp);
7610   indexed = (Boolean) (SeqMgrFeaturesAreIndexed (entityID) != 0);
7611 
7612   if (indexed) {
7613     sfp = SeqMgrGetBestProteinFeature (bsp, NULL);
7614   } else {
7615     if (dlp->m_is_seg) {
7616       SeqMgrIndexFeatures (entityID, NULL);
7617       indexed = TRUE;
7618       sfp = SeqMgrGetBestProteinFeature (bsp, NULL);
7619     } else {
7620       sfp = x_GetLongestProteinUnindexed (bsp);
7621     }
7622   }
7623 
7624   if (dlp->m_mi_completeness > 1 && dlp->m_mi_completeness < 6) {
7625     partial = TRUE;
7626   }
7627 
7628   if (sfp != NULL) {
7629     prp = (ProtRefPtr) sfp->data.value.ptrvalue;
7630     if (prp != NULL) {
7631       if (prp->name != NULL) {
7632         if (dlp->m_allprotnames) {
7633           for (vnp = prp->name; vnp != NULL; vnp = vnp->next) {
7634             str = (CharPtr) vnp->data.ptrvalue;
7635             ValNodeCopyStr (&strings, 0, prefix);
7636             ValNodeCopyStr (&strings, 0, str);
7637             prefix = "; ";
7638           }
7639           title = x_CatenateValNodeStrings (strings);
7640           strings = ValNodeFreeData (strings);
7641         } else {
7642           vnp = prp->name;
7643           /* although vnp should not be NULL, a compiler/optimizer bug might let it, so check again */
7644           if (vnp != NULL && vnp->data.ptrvalue != NULL) {
7645             str = (CharPtr) vnp->data.ptrvalue;
7646             title = StringSave (str);
7647           }
7648         }
7649         x_TrimPunctuationFromEnd (title);
7650 
7651         /* if hypothetical protein, append locus_tag */
7652         offset = 0;
7653         if (StringNICmp (title, "hypothetical protein", 20) == 0) {
7654           offset = 20;
7655         } else if (StringNICmp (title, "uncharacterized protein", 23) == 0) {
7656           offset = 23;
7657         }
7658         if (offset > 0) {
7659           ptr = title + offset;
7660           if (ptr [0] == ',' && ptr [1] == ' ') {
7661             comma = ",";
7662             ptr += 2;
7663           }
7664           if (ptr [0] == ' ') {
7665             ptr++;
7666           }
7667           if (StringNCmp (ptr, "isoform ", 8) == 0) {
7668             ptr += 8;
7669             isoform = ptr;
7670             ch = *ptr;
7671             while (ch != '\0' && IS_ALPHANUM (ch)) {
7672               ptr++;
7673               ch = *ptr;
7674             }
7675             if (ch != '\0') {
7676               isoform = NULL;
7677             } else {
7678               title [offset] = '\0';
7679             }
7680           }
7681         }
7682         if (StringICmp (title, "hypothetical protein") == 0 || StringICmp (title, "uncharacterized protein") == 0) {
7683           if (! indexed) {
7684             SeqMgrIndexFeatures (entityID, NULL);
7685             indexed = TRUE;
7686           }
7687           if (cds == NULL) {
7688             cds = SeqMgrGetCDSgivenProduct (bsp, NULL);
7689           }
7690           if (cds != NULL) {
7691             grp = SeqMgrGetGeneXref (cds);
7692             if (grp == NULL) {
7693               sfp = SeqMgrGetOverlappingFeature (cds->location, FEATDEF_GENE, NULL, 0, NULL, LOCATION_SUBSET, NULL);
7694               if (sfp != NULL) {
7695                 grp = (GeneRefPtr) sfp->data.value.ptrvalue;
7696               }
7697             }
7698             if (grp != NULL) {
7699               if (grp->locus_tag != NULL) {
7700                 len = StringLen (title) + StringLen (grp->locus_tag) + StringLen (isoform) + 35;
7701                 str = (CharPtr) MemNew (sizeof (Char) * len);
7702                 if (str != NULL) {
7703                   StringCat (str, title);
7704                   StringCat (str, " ");
7705                   StringCat (str, grp->locus_tag);
7706                   if (StringDoesHaveText (isoform)) {
7707                     if (comma != NULL) {
7708                       StringCat (str, comma);
7709                     }
7710                     StringCat (str, " isoform ");
7711                     StringCat (str, isoform);
7712                   }
7713                   MemFree (title);
7714                   title = str;
7715                 }
7716               }
7717             }
7718           }
7719         }
7720       }
7721 
7722       if ( title == NULL && prp->desc != NULL) {
7723         title = StringSave (prp->desc);
7724       }
7725 
7726       if ( title == NULL && prp->activity != NULL) {
7727         vnp = prp->activity;
7728         str = (CharPtr) vnp->data.ptrvalue;
7729         title = StringSave (str);
7730       }
7731     }
7732   }
7733 
7734   if (title == NULL) {
7735     if (! indexed) {
7736       SeqMgrIndexFeatures (entityID, NULL);
7737       indexed = TRUE;
7738     }
7739     if (cds == NULL) {
7740       cds = SeqMgrGetCDSgivenProduct (bsp, NULL);
7741     }
7742     if (cds != NULL) {
7743       grp = SeqMgrGetGeneXref (cds);
7744       if (grp == NULL) {
7745         sfp = SeqMgrGetOverlappingFeature (cds->location, FEATDEF_GENE, NULL, 0, NULL, LOCATION_SUBSET, NULL);
7746         if (sfp != NULL) {
7747           grp = (GeneRefPtr) sfp->data.value.ptrvalue;
7748         }
7749       }
7750       if (grp != NULL) {
7751         str = NULL;
7752         if (grp->locus != NULL) {
7753           str = grp->locus;
7754         } else if (grp->syn != NULL) {
7755           vnp = grp->syn;
7756           str = (CharPtr) vnp->data.ptrvalue;
7757         } else if (grp->desc != NULL) {
7758           str = grp->desc;
7759         }
7760         if (StringDoesHaveText (str)) {
7761           ValNodeCopyStr (&strings, 0, str);
7762           ValNodeCopyStr (&strings, 0, " gene product");
7763           title = x_CatenateValNodeStrings (strings);
7764           strings = ValNodeFreeData (strings);
7765         }
7766       }
7767     }
7768   }
7769 
7770   if (title == NULL) {
7771     title = StringSave ("unnamed protein product");
7772     if (! indexed) {
7773       SeqMgrIndexFeatures (entityID, NULL);
7774       indexed = TRUE;
7775     }
7776     if (cds == NULL) {
7777       cds = SeqMgrGetCDSgivenProduct (bsp, NULL);
7778     }
7779     if (cds != NULL) {
7780       grp = SeqMgrGetGeneXref (cds);
7781       if (grp == NULL) {
7782         sfp = SeqMgrGetOverlappingFeature (cds->location, FEATDEF_GENE, NULL, 0, NULL, LOCATION_SUBSET, NULL);
7783         if (sfp != NULL) {
7784           grp = (GeneRefPtr) sfp->data.value.ptrvalue;
7785         }
7786       }
7787       if (grp != NULL) {
7788         if (grp->locus_tag != NULL) {
7789           len = StringLen (title) + StringLen (grp->locus_tag) + 20;
7790           str = (CharPtr) MemNew (sizeof (Char) * len);
7791           if (str != NULL) {
7792             StringCat (str, title);
7793             StringCat (str, " ");
7794             StringCat (str, grp->locus_tag);
7795             MemFree (title);
7796             title = str;
7797           }
7798         }
7799       }
7800     }
7801   }
7802 
7803   if (title != NULL) {
7804     x_TrimPunctuationFromEnd (title);
7805   }
7806 
7807   taxname = dlp->m_taxname;
7808   if (StringHasNoText (taxname) || x_NotSpecialTaxName (taxname)) {
7809     if (! indexed) {
7810       SeqMgrIndexFeatures (entityID, NULL);
7811       indexed = TRUE;
7812     }
7813     if (cds == NULL) {
7814       cds = SeqMgrGetCDSgivenProduct (bsp, NULL);
7815     }
7816     if (cds != NULL) {
7817       slp = AsnIoMemCopy ((Pointer) cds->location, (AsnReadFunc) SeqLocAsnRead, (AsnWriteFunc) SeqLocAsnWrite);
7818       if (slp != NULL) {
7819         for (slpx = SeqLocFindNext (slp, NULL); slpx != NULL; slpx = SeqLocFindNext (slp, slpx)) {
7820           if (slpx->choice == SEQLOC_INT) {
7821             sintp = (SeqIntPtr) slpx->data.ptrvalue;
7822             if (sintp != NULL) {
7823               sintp->strand = Seq_strand_both;
7824             }
7825           } else if (slpx->choice == SEQLOC_PNT) {
7826             spp = (SeqPntPtr) slpx->data.ptrvalue;
7827             if (spp != NULL) {
7828               spp->strand = Seq_strand_both;
7829             }
7830           }
7831         }
7832         /*
7833         sfp = SeqMgrGetOverlappingSource (slp, &fcontext);
7834         */
7835         sfp = SeqMgrGetOverlappingFeature (slp, FEATDEF_BIOSRC, NULL, 0, NULL, LOCATION_SUBSET, &fcontext);
7836         if (sfp != NULL) {
7837           biop = (BioSourcePtr) sfp->data.value.ptrvalue;
7838           if (biop != NULL) {
7839             orp = biop->org;
7840             if (orp != NULL) {
7841               taxname = orp->taxname;
7842             }
7843           }
7844         }
7845         SeqLocFree (slp);
7846       }
7847     }
7848   }
7849 
7850   if (dlp->m_genome >= GENOME_chloroplast && dlp->m_genome <= GENOME_chromatophore) {
7851     organelle = proteinOrganellePrefix [dlp->m_genome];
7852     /*
7853     if (StringNICmp (organelle, taxname, StringLen (organelle)) == 0) {
7854       organelle = NULL;
7855     }
7856     */
7857   }
7858 
7859   if (cds == NULL) {
7860     if (! indexed) {
7861       SeqMgrIndexFeatures (entityID, NULL);
7862       indexed = TRUE;
7863     }
7864     cds = SeqMgrGetCDSgivenProduct (bsp, NULL);
7865   }
7866   if (cds != NULL) {
7867     if (x_CDShasLowQualityException (dlp, cds)) {
7868       if (StringStr (title, low_qual) == NULL) {
7869         len = StringLen (title) + StringLen (low_qual) + 6;
7870         tmp = (CharPtr) MemNew (sizeof (Char) * len);
7871         if (tmp != NULL) {
7872           StringCat (tmp, low_qual);
7873           StringCat (tmp, title);
7874           MemFree (title);
7875           title = tmp;
7876         }
7877       }
7878     }
7879   }
7880 
7881   if (partial) {
7882     len = StringLen (title) + 12;
7883     tmp = (CharPtr) MemNew (sizeof (Char) * len);
7884     if (tmp != NULL) {
7885       StringCat (tmp, title);
7886       StringCat (tmp, ", partial");
7887       MemFree (title);
7888       title = tmp;
7889     }
7890   }
7891   if (StringDoesHaveText (organelle)) {
7892     len = StringLen (title) + StringLen (organelle) + 6;
7893     tmp = (CharPtr) MemNew (sizeof (Char) * len);
7894     if (tmp != NULL) {
7895       StringCat (tmp, title);
7896       StringCat (tmp, " (");
7897       StringCat (tmp, organelle);
7898       StringCat (tmp, ")");
7899       MemFree (title);
7900       title = tmp;
7901     }
7902   }
7903 
7904   if (dlp->m_is_cross_kingdom && StringDoesHaveText (dlp->m_first_super_kingdom) && StringDoesHaveText (dlp->m_second_super_kingdom)) {
7905     len = StringLen (title) + StringLen (dlp->m_first_super_kingdom) + StringLen (dlp->m_second_super_kingdom) + 8;
7906     tmp = (CharPtr) MemNew (sizeof (Char) * len);
7907     if (tmp != NULL) {
7908       StringCat (tmp, title);
7909       StringCat (tmp, " [");
7910       StringCat (tmp, dlp->m_first_super_kingdom);
7911       StringCat (tmp, "][");
7912       StringCat (tmp, dlp->m_second_super_kingdom);
7913       StringCat (tmp, "]");
7914       MemFree (title);
7915       title = tmp;
7916     }
7917   } else if (StringDoesHaveText (taxname)) {
7918     len = StringLen (title) + StringLen (taxname) + 6;
7919     tmp = (CharPtr) MemNew (sizeof (Char) * len);
7920     if (tmp != NULL) {
7921       StringCat (tmp, title);
7922       StringCat (tmp, " [");
7923       StringCat (tmp, taxname);
7924       StringCat (tmp, "]");
7925       MemFree (title);
7926       title = tmp;
7927     }
7928   }
7929 
7930   if (result == NULL) {
7931     result = StringSave (title);
7932   }
7933 
7934   MemFree (title);
7935 
7936   return result;
7937 }
7938 
x_TitleFromSegSeq(DefLinePtr dlp)7939 static CharPtr x_TitleFromSegSeq (
7940   DefLinePtr dlp
7941 )
7942 
7943 {
7944   BioseqPtr          bsp;
7945   SeqMgrFeatContext  ccontext;
7946   SeqFeatPtr         cds;
7947   CharPtr            cln = NULL;
7948   CharPtr            complete = "gene, complete cds";
7949   Uint2              entityID;
7950   SeqMgrFeatContext  gcontext;
7951   SeqFeatPtr         gene;
7952   GeneRefPtr         grp;
7953   CharPtr            label = NULL;
7954   size_t             len;
7955   CharPtr            locus = NULL;
7956   CharPtr            modifier = NULL;
7957   CharPtr            product = NULL;
7958   CharPtr            result = NULL;
7959   CharPtr            str;
7960   CharPtr            taxname = NULL;
7961   ValNodePtr         vnp;
7962 
7963   if (dlp == NULL) return NULL;
7964 
7965   bsp = dlp->m_bioseq;
7966   if (bsp == NULL) return NULL;
7967 
7968   entityID = ObjMgrGetEntityIDForPointer (bsp);
7969   if (SeqMgrFeaturesAreIndexed (entityID) == 0) {
7970     SeqMgrIndexFeatures (entityID, NULL);
7971   }
7972 
7973   cds = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &ccontext);
7974 
7975   if (cds != NULL) {
7976     if (cds->partial) {
7977       complete = "gene, partial cds";
7978     }
7979     product = ccontext.label;
7980     grp = SeqMgrGetGeneXref (cds);
7981     if (grp != NULL) {
7982       if (StringDoesHaveText (grp->locus)) {
7983         locus = grp->locus;
7984       } else {
7985         vnp = grp->syn;
7986         if (vnp != NULL) {
7987           str = (CharPtr) vnp->data.ptrvalue;
7988           if (StringDoesHaveText (str)) {
7989             locus = str;
7990           }
7991         }
7992       }
7993     }
7994     if (locus == NULL) {
7995       gene = SeqMgrGetOverlappingGene (cds->location, &gcontext);
7996       if (gene != NULL) {
7997         locus = gcontext.label;
7998       }
7999     }
8000   } else {
8001     if (StringDoesHaveText (dlp->m_strain) && (! x_EndsWithStrain (dlp, dlp->m_strain))) {
8002       modifier = dlp->m_strain;
8003       label = " strain ";
8004     } else if (StringDoesHaveText (dlp->m_clone)) {
8005       cln = x_DescribeClones (dlp);
8006       modifier = cln;
8007     } else if (StringDoesHaveText (dlp->m_isolate)) {
8008       modifier = dlp->m_isolate;
8009       label = " isolate ";
8010     }
8011   }
8012 
8013   taxname = dlp->m_taxname;
8014   if (StringHasNoText (taxname)) {
8015     taxname = "Unknown";
8016   }
8017 
8018   len = StringLen (taxname) + StringLen (label) + StringLen (modifier) +
8019         StringLen (product) + StringLen (locus) + StringLen (complete) + 10;
8020 
8021   result = (CharPtr) MemNew (sizeof (Char) * len);
8022   if (result == NULL) {
8023     MemFree (cln);
8024     return NULL;
8025   }
8026 
8027   if (taxname != NULL) {
8028     StringCat (result, taxname);
8029   }
8030 
8031   if (modifier != NULL) {
8032     if (label != NULL) {
8033       StringCat (result, label);
8034     }
8035     StringCat (result, modifier);
8036   }
8037 
8038   if (product != NULL) {
8039     StringCat (result, " ");
8040     StringCat (result, product);
8041   }
8042   if (locus != NULL) {
8043     StringCat (result, " (");
8044     StringCat (result, locus);
8045     StringCat (result, ")");
8046   }
8047   if (product != NULL || locus != NULL) {
8048     StringCat (result, " ");
8049     StringCat (result, complete);
8050   }
8051   TrimSpacesAroundString (result);
8052 
8053   MemFree (cln);
8054 
8055   return result;
8056 }
8057 
x_StringInList(ValNodePtr strings,CharPtr str)8058  static Boolean x_StringInList (
8059   ValNodePtr strings,
8060   CharPtr str
8061 )
8062 
8063 {
8064   CharPtr     tmp;
8065   ValNodePtr  vnp;
8066 
8067   if (strings == NULL || StringHasNoText (str)) return FALSE;
8068 
8069   for (vnp = strings; vnp != NULL; vnp = vnp->next) {
8070     tmp = (CharPtr) vnp->data.ptrvalue;
8071     if (StringStr (tmp, str) != NULL) return TRUE;
8072   }
8073 
8074   return FALSE;
8075 }
8076 
8077 
x_TitleFromWGS(DefLinePtr dlp)8078 static CharPtr x_TitleFromWGS (
8079   DefLinePtr dlp
8080 )
8081 
8082 {
8083   CharPtr     result = NULL, cln, mod, ptr;
8084   ValNodePtr  strings = NULL;
8085 
8086   if (dlp == NULL) return NULL;
8087 
8088   ValNodeCopyStr (&strings, 0, dlp->m_taxname);
8089 
8090   if (StringDoesHaveText (dlp->m_strain)) {
8091     mod = StringSave (dlp->m_strain);
8092     ptr = StringChr (mod, ';');
8093     if (ptr != NULL) {
8094       *ptr = '\0';
8095     }
8096     if (! x_EndsWithStrain (dlp, mod)) {
8097       ValNodeCopyStr (&strings, 0, " strain ");
8098       ValNodeCopyStr (&strings, 0, mod);
8099     }
8100     MemFree (mod);
8101   } else if (StringDoesHaveText (dlp->m_breed)) {
8102     ValNodeCopyStr (&strings, 0, " breed ");
8103     mod = StringSave (dlp->m_breed);
8104     ptr = StringChr (mod, ';');
8105     if (ptr != NULL) {
8106       *ptr = '\0';
8107     }
8108     ValNodeCopyStr (&strings, 0, mod);
8109     MemFree (mod);
8110   } else if (StringDoesHaveText (dlp->m_cultivar)) {
8111     ValNodeCopyStr (&strings, 0, " cultivar ");
8112     mod = StringSave (dlp->m_cultivar);
8113     ptr = StringChr (mod, ';');
8114     if (ptr != NULL) {
8115       *ptr = '\0';
8116     }
8117     ValNodeCopyStr (&strings, 0, mod);
8118     MemFree (mod);
8119   }
8120 
8121   if (StringDoesHaveText (dlp->m_isolate)) {
8122     /* x_EndsWithStrain just checks for supplied pattern, using here for isolate */
8123     if (! x_EndsWithStrain (dlp, dlp->m_isolate)) {
8124       ValNodeCopyStr (&strings, 0, " isolate ");
8125       ValNodeCopyStr (&strings, 0, dlp->m_isolate);
8126     }
8127   }
8128 
8129   if (StringDoesHaveText (dlp->m_chromosome)) {
8130     ValNodeCopyStr (&strings, 0, " chromosome ");
8131     ValNodeCopyStr (&strings, 0, dlp->m_chromosome);
8132   }
8133 
8134   cln = x_DescribeClones (dlp);
8135   if (StringDoesHaveText (cln)) {
8136     ValNodeCopyStr (&strings, 0, cln);
8137   }
8138   MemFree (cln);
8139 
8140   if (StringDoesHaveText (dlp->m_map)) {
8141     ValNodeCopyStr (&strings, 0, " map ");
8142     ValNodeCopyStr (&strings, 0, dlp->m_map);
8143   }
8144 
8145   if (StringDoesHaveText (dlp->m_plasmid)) {
8146     if (dlp->m_is_wgs) {
8147       ValNodeCopyStr (&strings, 0, " plasmid ");
8148       ValNodeCopyStr (&strings, 0, dlp->m_plasmid);
8149     }
8150   }
8151 
8152   if (dlp->m_genome == GENOME_plasmid && dlp->m_topology == TOPOLOGY_CIRCULAR) {
8153   } else if (dlp->m_genome == GENOME_chromosome) {
8154   } else if (StringDoesHaveText (dlp->m_general_str) && StringICmp (dlp->m_general_str, dlp->m_chromosome) != 0) {
8155     ValNodeCopyStr (&strings, 0, " ");
8156     ValNodeCopyStr (&strings, 0, dlp->m_general_str);
8157   }
8158 
8159   result = x_CatenateValNodeStrings (strings);
8160   ValNodeFreeData (strings);
8161   if (result == NULL) return NULL;
8162 
8163   return result;
8164 }
8165 
x_TitleFromMap(DefLinePtr dlp)8166 static CharPtr x_TitleFromMap (
8167   DefLinePtr dlp
8168 )
8169 
8170 {
8171   BioseqPtr   bsp;
8172   CharPtr     result = NULL, mod, ptr;
8173   ValNodePtr  strings = NULL;
8174 
8175   if (dlp == NULL) return NULL;
8176 
8177   bsp = dlp->m_bioseq;
8178   if (bsp == NULL) return NULL;
8179   if (bsp->seq_ext_type != 3) return NULL;
8180   if (bsp->seq_ext == NULL) return NULL;
8181 
8182   ValNodeCopyStr (&strings, 0, dlp->m_taxname);
8183 
8184   if (StringDoesHaveText (dlp->m_strain)) {
8185     mod = StringSave (dlp->m_strain);
8186     ptr = StringChr (mod, ';');
8187     if (ptr != NULL) {
8188       *ptr = '\0';
8189     }
8190     if (! x_EndsWithStrain (dlp, mod)) {
8191       ValNodeCopyStr (&strings, 0, " strain ");
8192       ValNodeCopyStr (&strings, 0, mod);
8193     }
8194     MemFree (mod);
8195   }
8196 
8197   if (StringDoesHaveText (dlp->m_chromosome)) {
8198     ValNodeCopyStr (&strings, 0, " chromosome ");
8199     ValNodeCopyStr (&strings, 0, dlp->m_chromosome);
8200   } else if (dlp->m_is_chromosome) {
8201     ValNodeCopyStr (&strings, 0, " chromosome");
8202   }
8203 
8204   if (StringDoesHaveText (dlp->m_plasmid)) {
8205     ValNodeCopyStr (&strings, 0, " plasmid ");
8206     ValNodeCopyStr (&strings, 0, dlp->m_plasmid);
8207   } else if (dlp->m_is_plasmid) {
8208     ValNodeCopyStr (&strings, 0, " plasmid");
8209   }
8210 
8211   if (StringDoesHaveText (dlp->m_isolate)) {
8212     ValNodeCopyStr (&strings, 0, " isolate ");
8213     ValNodeCopyStr (&strings, 0, dlp->m_isolate);
8214   }
8215 
8216   if (StringDoesHaveText (dlp->m_enzyme)) {
8217     ValNodeCopyStr (&strings, 0, ", ");
8218     ValNodeCopyStr (&strings, 0, dlp->m_enzyme);
8219     ValNodeCopyStr (&strings, 0, " whole genome map");
8220   }
8221 
8222   result = x_CatenateValNodeStrings (strings);
8223   ValNodeFreeData (strings);
8224   if (result == NULL) return NULL;
8225 
8226   return result;
8227 }
8228 
/*
 * x_SetPrefix
 *
 *   Chooses the tag placed in front of the title.  Exactly one category is
 *   considered, in this priority order: unverified, TSA, TLS, third party
 *   (TPA_exp / TPA_inf / TPA_asm / plain TPA), multispecies WP record,
 *   pseudogene.  The UNVERIFIED and PUTATIVE PSEUDOGENE tags are suppressed
 *   when the title already contains that text; in that case no lower
 *   priority category is tried.
 *
 *   dlp   - defline state holding the flags
 *   title - current title text, only searched, never modified
 *
 *   Returns a StringSave copy of the chosen tag (NewCreateDefLineExEx
 *   releases it with MemFree), or NULL when dlp is NULL.
 */
static CharPtr x_SetPrefix (
  DefLinePtr dlp,
  CharPtr title
)

{
  CharPtr  tag = NULL;

  if (dlp == NULL) return NULL;

  if (dlp->m_is_unverified) {
    tag = (StringStr (title, "UNVERIFIED") == NULL) ? "UNVERIFIED: " : NULL;
  } else if (dlp->m_is_tsa) {
    tag = "TSA: ";
  } else if (dlp->m_is_tls) {
    tag = "TLS: ";
  } else if (dlp->m_third_party) {
    /* start from the generic tag, then refine by TPA flavor */
    tag = "TPA: ";
    if (dlp->m_tpa_exp) {
      tag = "TPA_exp: ";
    } else if (dlp->m_tpa_inf) {
      tag = "TPA_inf: ";
    } else if (dlp->m_tpa_reasm) {
      tag = "TPA_asm: ";
    }
  } else if (dlp->m_multispecies && dlp->m_is_wp) {
    tag = "MULTISPECIES: ";
  } else if (dlp->m_is_pseudogene) {
    tag = (StringStr (title, "PUTATIVE PSEUDOGENE") == NULL) ? "PUTATIVE PSEUDOGENE: " : NULL;
  }

  return StringSave (tag);
}
8267 
CountDeltaGaps(BioseqPtr bsp)8268 static Int4 CountDeltaGaps (
8269   BioseqPtr bsp
8270 )
8271 
8272 {
8273   DeltaSeqPtr  dsp;
8274   Int4         num_gaps = 0;
8275   SeqLitPtr    slitp;
8276   SeqLocPtr    slocp;
8277 
8278   if (bsp == NULL) return 0;
8279 
8280   if (bsp->repr == Seq_repr_delta) {
8281     for (dsp = (DeltaSeqPtr) bsp->seq_ext; dsp != NULL; dsp = dsp->next) {
8282       switch (dsp->choice) {
8283         case 1:
8284           slocp = (SeqLocPtr)(dsp->data.ptrvalue);
8285           if (slocp == NULL) break;
8286           if (slocp->choice == SEQLOC_NULL) {
8287             num_gaps++;
8288           }
8289           break;
8290         case 2:
8291           slitp = (SeqLitPtr)(dsp->data.ptrvalue);
8292           if (slitp == NULL) break;
8293           if (slitp->seq_data == NULL || slitp->seq_data_type == Seq_code_gap) {
8294             num_gaps++;
8295           }
8296           break;
8297         default:
8298           break;
8299       }
8300     }
8301   }
8302 
8303   return num_gaps;
8304 }
8305 
/*
 * x_SetSuffix
 *
 *   Computes the text appended after the title.  The suffix is the
 *   concatenation of three optional pieces:
 *
 *     " <targeted locus>"  - for MI_TECH_targeted, when not already in title
 *     technique text       - chosen from dlp->m_mi_tech (HTGS phase, EST,
 *                            STS, GSS, WGS, TSA, targeted), normally skipped
 *                            when the title already contains the key phrase
 *     completeness text    - ", complete sequence" / ", complete genome",
 *                            only when appendComplete is set, the title has
 *                            neither "complete" nor "partial", and
 *                            m_mi_completeness is 1
 *
 *   For unordered delta sequences with at least one gap, the technique text
 *   is replaced by ", N unordered pieces" (N = gaps + 1).
 *
 *   dlp            - defline state holding the flags and source fields
 *   title          - current title text, only searched, never modified
 *   appendComplete - whether a completeness phrase may be added
 *
 *   Returns a MemNew-allocated string (possibly empty), or NULL when dlp is
 *   NULL or allocation fails.  NewCreateDefLineExEx releases it with MemFree.
 */
static CharPtr x_SetSuffix (
  DefLinePtr dlp,
  CharPtr title,
  Boolean appendComplete
)

{
  CharPtr  completeness = "", locus_study = "", order = "ordered", tech_text = "";
  Int4     gap_count, seg_count;
  Boolean  organelle_genome;
  Char     scratch [512];   /* backing store when tech_text must be composed */
  CharPtr  suffix;
  size_t   total;

  if (dlp == NULL) return NULL;

  scratch [0] = '\0';

  switch (dlp->m_mi_tech) {
    case MI_TECH_htgs_0 :
      if (StringStr (title, "LOW-PASS") == NULL) {
        tech_text = ", LOW-PASS SEQUENCE SAMPLING";
      }
      break;

    case MI_TECH_htgs_1 :
    case MI_TECH_htgs_2 :
      /* phase 1 pieces are unordered, phase 2 pieces are ordered */
      if (dlp->m_mi_tech == MI_TECH_htgs_1) {
        order = "unordered";
      }
      if (dlp->m_htgs_draft) {
        if (StringStr (title, "WORKING DRAFT") == NULL) {
          tech_text = ", WORKING DRAFT SEQUENCE";
        }
      } else if (! dlp->m_htgs_cancelled) {
        if (StringStr (title, "SEQUENCING IN") == NULL) {
          tech_text = ", *** SEQUENCING IN PROGRESS ***";
        }
      }
      /* for gapped delta sequences, add the piece count after the status text */
      if (dlp->m_is_delta &&
          CountGapsInDeltaSeq (dlp->m_bioseq, &seg_count, &gap_count, NULL, NULL, NULL, 0) &&
          gap_count > 0) {
        sprintf (scratch, "%s, %ld %s pieces", tech_text, (long) (gap_count + 1), order);
        tech_text = scratch;
      }
      break;

    case MI_TECH_htgs_3 :
      if (StringStr (title, "complete sequence") == NULL) {
        tech_text = ", complete sequence";
      }
      break;

    case MI_TECH_est :
      if (StringStr (title, "mRNA sequence") == NULL) {
        tech_text = ", mRNA sequence";
      }
      break;

    case MI_TECH_sts :
      if (StringStr (title, "sequence tagged site") == NULL) {
        tech_text = ", sequence tagged site";
      }
      break;

    case MI_TECH_survey :
      if (StringStr (title, "genomic survey sequence") == NULL) {
        tech_text = ", genomic survey sequence";
      }
      break;

    case MI_TECH_wgs :
      if (dlp->m_wgs_master) {
        if (StringStr (title, "whole genome shotgun sequencing") == NULL) {
          tech_text = ", whole genome shotgun sequencing project";
        }
      } else if (StringStr (title, "whole genome shotgun sequence") == NULL) {
        /* organelle name goes first, unless the title already mentions it */
        if (StringDoesHaveText (dlp->m_organelle) && StringStr (title, dlp->m_organelle) == NULL) {
          StringCat (scratch, " ");
          StringCat (scratch, dlp->m_organelle);
        }
        StringCat (scratch, ", whole genome shotgun sequence");
        tech_text = scratch;
      }
      break;

    case MI_TECH_tsa :
      if (dlp->m_tsa_master) {
        if (StringStr (title, "transcriptome shotgun assembly") == NULL) {
          tech_text = ", transcriptome shotgun assembly";
        }
      } else if (StringStr (title, "RNA sequence") == NULL) {
        /* name the RNA class from the biomol; other biomols add nothing */
        switch (dlp->m_mi_biomol) {
          case MOLECULE_TYPE_MRNA :
            tech_text = ", mRNA sequence";
            break;
          case MOLECULE_TYPE_RRNA :
            tech_text = ", rRNA sequence";
            break;
          case MOLECULE_TYPE_NCRNA :
            tech_text = ", ncRNA sequence";
            break;
          case MOLECULE_TYPE_PRE_MRNA :
          case MOLECULE_TYPE_SNRNA :
          case MOLECULE_TYPE_SCRNA :
          case MOLECULE_TYPE_CRNA :
          case MOLECULE_TYPE_SNORNA :
          case MOLECULE_TYPE_TRANSCRIBED_RNA :
            tech_text = ", transcribed RNA sequence";
            break;
          default :
            break;
        }
      }
      break;

    case MI_TECH_targeted :
      if (dlp->m_tls_master) {
        if (StringStr (title, "targeted locus study") == NULL) {
          tech_text = ", targeted locus study";
        }
      } else if (StringStr (title, "sequence") == NULL) {
        tech_text = ", sequence";
      }
      if (StringDoesHaveText (dlp->m_targeted_locus) && StringStr (title, dlp->m_targeted_locus) == NULL) {
        locus_study = dlp->m_targeted_locus;
      }
      break;

    default :
      break;
  }

  if (appendComplete &&
      StringStr (title, "complete") == NULL &&
      StringStr (title, "partial") == NULL &&
      dlp->m_mi_completeness == 1) {
    organelle_genome = (Boolean) (dlp->m_genome == GENOME_mitochondrion ||
                                  dlp->m_genome == GENOME_chloroplast ||
                                  dlp->m_genome == GENOME_kinetoplast ||
                                  dlp->m_genome == GENOME_plastid ||
                                  dlp->m_genome == GENOME_apicoplast);
    if (dlp->m_is_plasmid) {
      completeness = ", complete sequence";
    } else if (organelle_genome) {
      completeness = ", complete genome";
    } else if (dlp->m_is_chromosome) {
      /* a named chromosome is one sequence of a genome, an unnamed one is the genome */
      if (StringDoesHaveText (dlp->m_chromosome)) {
        completeness = ", complete sequence";
      } else {
        completeness = ", complete genome";
      }
    }
  }

  /* unordered delta sequences override whatever technique text was chosen */
  if (dlp->m_unordered && dlp->m_is_delta) {
    gap_count = CountDeltaGaps (dlp->m_bioseq);
    if (gap_count > 0) {
      sprintf (scratch, ", %ld unordered pieces", (long) (gap_count + 1));
      tech_text = scratch;
    }
  }

  total = StringLen (tech_text) + StringLen (locus_study) + StringLen (completeness) + 5;
  suffix = (CharPtr) MemNew (total * sizeof (Char));
  if (suffix == NULL) return NULL;

  suffix [0] = '\0';
  if (StringDoesHaveText (locus_study)) {
    StringCat (suffix, " ");
    StringCat (suffix, locus_study);
  }
  StringCat (suffix, tech_text);
  StringCat (suffix, completeness);

  return suffix;
}
8474 
8475 static CharPtr tpa_prefix_list [] = {
8476   "TPA:",
8477   "TPA_exp:",
8478   "TPA_inf:",
8479   "TPA_reasm:",
8480   "TPA_asm:",
8481   "TSA:",
8482   "UNVERIFIED:",
8483   NULL
8484 };
8485 
NewCreateDefLineExEx(ItemInfoPtr iip,BioseqPtr bsp,Boolean ignoreTitle,Boolean extProtTitle,Boolean gpipeMode,Boolean devMode)8486 NLM_EXTERN CharPtr NewCreateDefLineExEx (
8487   ItemInfoPtr iip,
8488   BioseqPtr bsp,
8489   Boolean ignoreTitle,
8490   Boolean extProtTitle,
8491   Boolean gpipeMode,
8492   Boolean devMode
8493 )
8494 
8495 {
8496   Boolean        appendComplete = FALSE;
8497   Boolean        capitalize = TRUE;
8498   Char           ch;
8499   DefLinePtr     dlp;
8500   Uint2          entityID;
8501   int            i;
8502   size_t         len;
8503   ObjValNodePtr  ovp;
8504   CharPtr        result = NULL, prefix = NULL, suffix = NULL, title = NULL, fix = NULL;
8505   SeqDescrPtr    sdp = NULL;
8506   CharPtr        str = NULL;
8507 
8508   if (bsp == NULL) return NULL;
8509 
8510   /* now using GetNextDescriptorUnindexed, so need to have called AssignIDsInEntityEx */
8511   if (bsp->idx.entityID == 0) {
8512     entityID = ObjMgrGetEntityIDForPointer (bsp);
8513     if (entityID != 0) {
8514       AssignIDsInEntityEx (entityID, 0, NULL, NULL);
8515     }
8516   }
8517 
8518   dlp = (DefLinePtr) MemNew (sizeof (DefLineData));
8519   if (dlp == NULL) return NULL;
8520 
8521   dlp->m_low_quality_fsa = TextFsaNew ();
8522   TextFsaAdd (dlp->m_low_quality_fsa, "heterogeneous population sequenced");
8523   TextFsaAdd (dlp->m_low_quality_fsa, "low-quality sequence region");
8524   TextFsaAdd (dlp->m_low_quality_fsa, "unextendable partial coding region");
8525 
8526   /* set flags from record components */
8527   dlp->m_iip = iip;
8528   dlp->m_bioseq = bsp;
8529 
8530   dlp->m_reconstruct = ignoreTitle;
8531   dlp->m_allprotnames = extProtTitle;
8532 
8533   dlp->m_gpipemode = gpipeMode;
8534   dlp->m_devmode = devMode;
8535 
8536   /* clear ItemInfo fields */
8537   if (iip != NULL) {
8538     iip->entityID = 0;
8539     iip->itemID = 0;
8540     iip->itemtype = 0;
8541   }
8542 
8543   /* set flags from record components */
8544   x_SetFlags (dlp);
8545 
8546   if (! dlp->m_reconstruct) {
8547     /* look for existing instantiated title */
8548     if (dlp->m_is_aa && (! dlp->m_is_pdb)) {
8549       sdp = BioseqGetSeqDescr (bsp, Seq_descr_title, NULL);
8550       if (sdp != NULL && sdp->choice == Seq_descr_title) {
8551         str = (CharPtr) sdp->data.ptrvalue;
8552       }
8553     } else {
8554       sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_title, NULL);
8555       if (sdp != NULL && sdp->choice == Seq_descr_title) {
8556         str = (CharPtr) sdp->data.ptrvalue;
8557       }
8558     }
8559     if (StringDoesHaveText (str)) {
8560       title = StringSave (str);
8561       /* strip trailing periods, commas, semicolons, etc. */
8562       x_TrimPunctuationFromEnd (title);
8563       capitalize = FALSE;
8564 
8565       /* set ItemInfo fields for selection */
8566       if (iip != NULL && sdp != NULL && sdp->extended != 0) {
8567         ovp = (ObjValNodePtr) sdp;
8568         iip->entityID = ovp->idx.entityID;
8569         iip->itemtype = ovp->idx.itemtype;
8570         iip->itemID = ovp->idx.itemID;
8571       }
8572     }
8573   }
8574 
8575   /* use appropriate algorithm if title needs to be generated */
8576   if (StringHasNoText (title)) {
8577     /* PDB and patent records do not normally need source data */
8578     if (dlp->m_is_pdb) {
8579       title = x_TitleFromPDB (dlp);
8580     } else if (dlp->m_is_patent) {
8581       title = x_TitleFromPatent (dlp);
8582     }
8583 
8584     if (StringHasNoText (title)) {
8585       /* set fields from source information */
8586       x_SetBioSrc (dlp);
8587 
8588       /* several record types have specific methods */
8589       if (dlp->m_is_nc) {
8590         title = x_TitleFromNC (dlp);
8591       } else if (dlp->m_is_nm) {
8592         title = x_TitleFromNM (dlp);
8593       } else if (dlp->m_is_nr) {
8594         title = x_TitleFromNR (dlp);
8595       } else if (dlp->m_is_aa) {
8596         title = x_TitleFromProtein (dlp);
8597       } else if (dlp->m_is_seg && (! dlp->m_is_est_sts_gss)) {
8598         title = x_TitleFromSegSeq (dlp);
8599       } else if (dlp->m_is_tsa || (dlp->m_is_wgs && (! dlp->m_wgs_master)) || (dlp->m_is_tls && (! dlp->m_tls_master))) {
8600         title = x_TitleFromWGS (dlp);
8601       } else if (dlp->m_is_map) {
8602         title = x_TitleFromMap (dlp);
8603       }
8604     }
8605 
8606     if (StringHasNoText (title) && dlp->m_gpipemode) {
8607       /* title using gpipe policy */
8608       title = x_TitleFromGPipe (dlp);
8609     }
8610 
8611     if (StringHasNoText (title)) {
8612       /* default title using source fields */
8613       title = x_TitleFromBioSrc (dlp);
8614       if (dlp->m_mi_completeness == 1 && StringDoesHaveText (title)) {
8615         appendComplete = TRUE;
8616       }
8617     }
8618 
8619     if (StringHasNoText (title)) {
8620       /* last resort title created here */
8621       /*
8622       title = StringSave ("No definition line found");
8623       */
8624     }
8625   }
8626 
8627   /* remove TPA or TSA prefix, will rely on other data in record to set */
8628   for (i = 0; tpa_prefix_list [i] != NULL; i++) {
8629     len = StringLen (tpa_prefix_list [i]);
8630     if (StringNICmp (title, tpa_prefix_list [i], len) == 0) {
8631       x_TrimFirstNCharacters (title, len);
8632     }
8633   }
8634 
8635   /* strip leading spaces remaining after removal of old TPA or TSA prefixes */
8636   TrimSpacesAroundString (title);
8637 
8638   /* strip trailing commas, semicolons, and spaces (period may be an sp. species) */
8639   x_TrimMostPunctFromEnd (title);
8640 
8641   /* calcualte prefix */
8642   prefix = x_SetPrefix (dlp, title);
8643 
8644   /* calculate suffix */
8645   suffix = x_SetSuffix (dlp, title, appendComplete);
8646 
8647   len = StringLen (prefix) + StringLen (title) + StringLen (suffix) + 4;
8648   result = (CharPtr) MemNew (sizeof (Char) * len);
8649 
8650   if (result != NULL) {
8651     StringCat (result, prefix);
8652     StringCat (result, title);
8653     StringCat (result, suffix);
8654 
8655     if (dlp->m_is_aa) {
8656       fix = StringStr (result, ". [");
8657       if (fix == NULL) {
8658         fix = StringStr (result, ", [");
8659       }
8660       if (fix != NULL) {
8661         *fix = ' ';
8662       }
8663     }
8664 
8665     fix = StringStr (result, " ,");
8666       if (fix != NULL) {
8667         fix [0] = ',';
8668         fix [1] = ' ';
8669       }
8670 
8671     fix = StringStr (result, ",,");
8672       if (fix != NULL) {
8673         fix [1] = ' ';
8674       }
8675   }
8676 
8677   MemFree (prefix);
8678   MemFree (title);
8679   MemFree (suffix);
8680 
8681   TextFsaFree (dlp->m_low_quality_fsa);
8682 
8683   Asn2gnbkCompressSpaces (result);
8684 
8685   if (! dlp->m_is_pdb && ! dlp->m_is_patent && ! dlp->m_is_aa && ! dlp->m_is_seg) {
8686     if (result != NULL) {
8687       ch = result [0];
8688       if (IS_LOWER (ch) && capitalize) {
8689         result [0] = TO_UPPER (ch);
8690       }
8691     }
8692   }
8693 
8694   dlp = MemFree (dlp);
8695 
8696   return result;
8697 }
8698 
NewCreateDefLineEx(ItemInfoPtr iip,BioseqPtr bsp,Boolean ignoreTitle,Boolean extProtTitle,Boolean gpipeMode)8699 NLM_EXTERN CharPtr NewCreateDefLineEx (
8700   ItemInfoPtr iip,
8701   BioseqPtr bsp,
8702   Boolean ignoreTitle,
8703   Boolean extProtTitle,
8704   Boolean gpipeMode
8705 )
8706 
8707 {
8708   return NewCreateDefLineExEx (iip, bsp, ignoreTitle, extProtTitle, gpipeMode, FALSE);
8709 }
8710 
NewCreateDefLine(ItemInfoPtr iip,BioseqPtr bsp,Boolean ignoreTitle,Boolean extProtTitle)8711 NLM_EXTERN CharPtr NewCreateDefLine (
8712   ItemInfoPtr iip,
8713   BioseqPtr bsp,
8714   Boolean ignoreTitle,
8715   Boolean extProtTitle
8716 )
8717 
8718 {
8719   return NewCreateDefLineExEx (iip, bsp, ignoreTitle, extProtTitle, FALSE, FALSE);
8720 }
8721 
NewCreateDefLineBuf(ItemInfoPtr iip,BioseqPtr bsp,CharPtr buf,Uint4 buflen,Boolean ignoreTitle,Boolean extProtTitle)8722 NLM_EXTERN Boolean NewCreateDefLineBuf (
8723   ItemInfoPtr iip,
8724   BioseqPtr bsp,
8725   CharPtr buf,
8726   Uint4 buflen,
8727   Boolean ignoreTitle,
8728   Boolean extProtTitle)
8729 
8730 {
8731   CharPtr  title = NULL;
8732 
8733   if (bsp == NULL || buf == NULL|| buflen == 0) return FALSE;
8734 
8735   title = NewCreateDefLineEx (iip, bsp, ignoreTitle, extProtTitle, FALSE);
8736   StringNCpy_0 (buf, title, buflen);
8737   MemFree (title);
8738 
8739   return TRUE;
8740 }
8741 
8742