1 /* tofasta.c
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name: tofasta.c
27 *
28 * Author: James Ostell
29 *
30 * Version Creation Date: 7/12/91
31 *
32 * $Revision: 6.313 $
33 *
34 * File Description: various sequence objects to fasta output
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date Name Description of modification
39 * ------- ---------- -----------------------------------------------------
40 *
41 *
42 * ==========================================================================
43 */
44 #include <tofasta.h>
45 #include <gather.h>
46 #include <sqnutils.h> /* MakeSeqID */
47 #include <subutil.h> /* MOLECULE_TYPE_GENOMIC */
48 #include <explore.h>
49 #include <objloc.h>
50 #include <objfdef.h>
51 #include <asn2gnbi.h>
52 #include <objmacro.h>
53 #include <macroapi.h>
54
/* Character-read primitive: fgetc on OS_UNIX_DARWIN builds, getc otherwise. */
#if defined(OS_UNIX_DARWIN)
#  define NLM_GETC fgetc
#else
#  define NLM_GETC getc
#endif

/* SeqLocNew simply forwards its argument to ValNodeNew. */
#define SeqLocNew(_a) ValNodeNew((_a))
61
62 static Uint1 na_order[NUM_SEQID] = { /* order of nucleic acid deflines */
63 255, /* 0 = not set */
64 230, /* 1 = local Object-id */
65 30, /* 2 = gibbsq */
66 30, /* 3 = gibbmt */
67 255, /* 4 = giim Giimport-id */
68 20, /* 5 = genbank */
69 20, /* 6 = embl */
70 255, /* 7 = pir */
71 255, /* 8 = swissprot */
72 40, /* 9 = patent */
73 15, /* 10 = other TextSeqId (RefGene) */
74 50, /* 11 = general Dbtag */
75 120, /* 12 = gi */
76 20, /* 13 = ddbj */
77 255, /* 14 = prf */
78 30, /* 15 = pdb */
79 20, /* 16 = tpg */
80 20, /* 17 = tpe */
81 20, /* 18 = tpd */
82 20, /* 19 = gpp */
83 20 /* 30 = nat */
84 };
85 static Uint1 aa_order[NUM_SEQID] = { /* order of nucleic acid deflines */
86 255, /* 0 = not set */
87 230, /* 1 = local Object-id */
88 40, /* 2 = gibbsq */
89 40, /* 3 = gibbmt */
90 255, /* 4 = giim Giimport-id */
91 60, /* 5 = genbank */
92 60, /* 6 = embl */
93 30, /* 7 = pir */
94 20, /* 8 = swissprot */
95 80, /* 9 = patent */
96 15, /* 10 = other TextSeqId (RefGene) */
97 90, /* 11 = general Dbtag */
98 120, /* 12 = gi */
99 60, /* 13 = ddbj */
100 70, /* 14 = prf */
101 50, /* 15 = pdb */
102 60, /* 16 = tpg */
103 60, /* 17 = tpe */
104 60, /* 18 = tpd */
105 60, /* 19 = gpp */
106 60 /* 20 = nat */
107 };
/* Usable length of the id/defline/sequence-line buffers handed to MyFsa. */
#define FASTA_BUFFER_LEN 524288
/* Order value BioseqToFastaX forces onto any Bioseq carrying a patent id. */
#define PATENT_ORDER 110
110 /*****************************************************************************
111 *
112 * The above sets the ordering to be, lowest to highest
113 *
114 Nucleic Acids:
115 GenBank/EMBL/DDBJ
116 PDB
117 Patents
118 Anything else
119 Proteins:
120 SWISSPROT
121 PIR
122 NCBI BackBone (but not in GenBank)
123 PDB
124 GenBank/EMBL/DDBJ translations
125 PRF
126 Patents
127 Anything else
128 *
129 *****************************************************************************/
/*
 * GetOrderBySeqId: look up the defline ordering value for a Seq-id choice.
 *
 *   choice   Seq-id choice; used as the index into the order tables
 *   is_prot  TRUE selects aa_order, FALSE selects na_order
 *
 * Returns the table entry, or -1 when choice does not index a table slot.
 */
Int4 GetOrderBySeqId(Int4 choice, Boolean is_prot)
{
    /* Both tables hold exactly NUM_SEQID entries, so NUM_SEQID itself and
       negative values fall outside them. */
    if (choice < 0 || choice >= NUM_SEQID)
        return -1;

    return is_prot ? aa_order[choice] : na_order[choice];
}
139 /*****************************************************************************
140 *
141 * Traversal routine for SeqEntryToFasta
142 *
143 *****************************************************************************/
/*
 * SeqEntryFasta: per-entry callback handed to SeqEntryExplore by the
 * SeqEntrysToFasta* drivers.
 *
 *   sep     entry being visited (a Bioseq or a Bioseq-set)
 *   data    the driver's FastaDat, passed through as an untyped pointer
 *   index   not used here
 *   indent  value supplied by the explorer; compared against the remembered
 *           seg / parts / last_indent values below
 *
 * Every visit refreshes mfp->organism (when a name is available) and
 * mfp->tech from the entry's own descriptors.  Bioseq-sets only update the
 * segset/parts bookkeeping; Bioseqs are written according to
 * tfa->group_segs, and tfa->got_one is set when one is written.
 */
void SeqEntryFasta (SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent)
{
    FastaPtr     tfa;
    MyFsaPtr     mfp;
    BioseqPtr    bsp = NULL;
    BioseqSetPtr bssp = NULL;
    BioSourcePtr biop;
    OrgRefPtr    orp = NULL;
    MolInfoPtr   mip = NULL;
    ValNodePtr   vnp;
    SeqIdPtr     sip;
    TextSeqIdPtr tsip;
    Boolean      is_na;

    tfa = (FastaPtr) data;
    mfp = tfa->mfp;

    /* parts-only mode: once indent is no greater than the remembered parts
       level we are out of the parts set */
    if ((tfa->group_segs == 2) && (tfa->parts != -1) && (indent <= tfa->parts))
    {
        tfa->parts = -1;
        tfa->seg = -1;
    }

    if (IS_Bioseq(sep))
    {
        bsp = (BioseqPtr)(sep->data.ptrvalue);
        vnp = bsp->descr;
    }
    else
    {
        bssp = (BioseqSetPtr)(sep->data.ptrvalue);
        vnp = bssp->descr;
    }

    /* scan descriptors for organism and molinfo; later ones override earlier */
    for (; vnp != NULL; vnp = vnp->next)
    {
        if (vnp->choice == Seq_descr_source)
        {
            biop = (BioSourcePtr)(vnp->data.ptrvalue);
            if (biop != NULL)    /* tolerate a descriptor with no payload */
                orp = biop->org;
        }
        else if (vnp->choice == Seq_descr_org)
        {
            orp = (OrgRefPtr)(vnp->data.ptrvalue);
        }
        else if (vnp->choice == Seq_descr_molinfo)
        {
            mip = (MolInfoPtr)(vnp->data.ptrvalue);
        }
    }

    /* taxname is preferred over the common name; with neither, the previous
       mfp->organism is left in place */
    if (orp != NULL)
    {
        if (orp->taxname != NULL)
            mfp->organism = orp->taxname;
        else if (orp->common != NULL)
            mfp->organism = orp->common;
    }

    mfp->tech = (mip != NULL) ? mip->tech : 0;

    if (! IS_Bioseq(sep))    /* check for taking only parts of seg seqs */
    {
        if (tfa->group_segs == 2)
        {
            if (bssp->_class == 2)          /* segset */
            {
                tfa->seg = indent;
            }
            else if ((bssp->_class == 4)    /* parts */
                     && (tfa->seg >= 0) && (tfa->seg < indent))
            {
                tfa->parts = indent;        /* in parts set */
            }
        }
        return;
    }

    is_na = tfa->is_na;

    /* protein output requested but this Bioseq is not a protein: remember
       its accession (check for translations) */
    if ((! is_na) && (! ISA_aa(bsp->mol)))
    {
        for (sip = bsp->id; sip != NULL; sip = sip->next)
        {
            switch (sip->choice)
            {
                case SEQID_GENBANK:
                case SEQID_EMBL:
                case SEQID_DDBJ:
                case SEQID_OTHER:
                case SEQID_TPG:
                case SEQID_TPE:
                case SEQID_TPD:
                case SEQID_GPIPE:
                    tsip = (TextSeqIdPtr)(sip->data.ptrvalue);
                    if ((tsip != NULL) && (tsip->accession != NULL))
                        mfp->accession = tsip->accession;
                    break;
                default:
                    break;
            }
        }
    }

    if (tfa->last_indent != -1)    /* putting out segments together */
    {
        if (indent > tfa->last_indent)
            return;
        tfa->last_indent = -1;
    }

    switch (tfa->group_segs)
    {
        case 0:     /* do raw bioseqs only */
            if (BioseqRawToFastaX(bsp, mfp, is_na))
                tfa->got_one = TRUE;
            break;

        case 1:     /* do segmented sets */
            if (BioseqToFastaX(bsp, mfp, is_na))
            {
                tfa->got_one = TRUE;
                if (bsp->repr == Seq_repr_seg)
                    tfa->last_indent = indent;
            }
            break;

        case 2:     /* take only the parts */
            if ((tfa->parts >= 0) && BioseqRawToFastaX(bsp, mfp, is_na))
                tfa->got_one = TRUE;
            break;

        default:
            break;
    }
}
281 /*****************************************************************************
282 *
283 * SeqEntryToFasta(sep, fp, is_na)
284 *
285 *****************************************************************************/
SeqEntryToFasta(SeqEntryPtr sep,FILE * fp,Boolean is_na)286 NLM_EXTERN Boolean SeqEntryToFasta (SeqEntryPtr sep, FILE *fp, Boolean is_na)
287 {
288 if (IS_Bioseq(sep))
289 return SeqEntrysToFasta(sep, fp, is_na, 3);
290 else
291 return SeqEntrysToFasta(sep, fp, is_na, 0);
292 }
293 static Boolean SeqEntrysToFastaXX (SeqEntryPtr sep, FILE *fp, Boolean is_na, Uint1 group_segs, Boolean printid_general);
SeqEntryToFastaEx(SeqEntryPtr sep,FILE * fp,Boolean is_na,Boolean printid_general)294 NLM_EXTERN Boolean SeqEntryToFastaEx (SeqEntryPtr sep, FILE *fp, Boolean is_na, Boolean printid_general)
295 {
296 if (IS_Bioseq(sep))
297 return SeqEntrysToFastaXX(sep, fp, is_na, 3, printid_general);
298 else
299 return SeqEntrysToFastaXX(sep, fp, is_na, 0, printid_general);
300 }
301 /*****************************************************************************
302 *
303 * FastaFileFunc(key, buf, data)
304 * standard "write to file" callback
305 *
306 *****************************************************************************/
FastaFileFunc(BioseqPtr bsp,Int2 key,CharPtr buf,Uint4 buflen,Pointer data)307 NLM_EXTERN Boolean FastaFileFunc (BioseqPtr bsp, Int2 key, CharPtr buf,
308 Uint4 buflen, Pointer data)
309 {
310 FILE * fp;
311 fp = (FILE *)data;
312 switch (key)
313 {
314 case FASTA_ID:
315 fprintf(fp, ">%s ", buf);
316 break;
317 case FASTA_DEFLINE:
318 fprintf(fp, "%s\n", buf);
319 break;
320 case FASTA_SEQLINE:
321 fprintf(fp, "%s\n", buf);
322 break;
323 case FASTA_EOS: /* end of sequence */
324 break;
325 default:
326 break;
327 }
328 return TRUE;
329 }
/*****************************************************************************
*
*   FastaDumpFileFunc(bsp, key, buf, buflen, data)
*       standard "write to file" callback
*
*       Used for BLAST (FASTA) databases. If the defline is
*       longer than buflen, then check that an ID is not
*       truncated in the middle.
*
*****************************************************************************/
FastaDumpFileFunc(BioseqPtr bsp,Int2 key,CharPtr buf,Uint4 buflen,Pointer data)340 NLM_EXTERN Boolean FastaDumpFileFunc (BioseqPtr bsp, Int2 key, CharPtr buf,
341 Uint4 buflen, Pointer data)
342 {
343 FILE * fp;
344 fp = (FILE *)data;
345 switch (key)
346 {
347 case FASTA_ID:
348 fprintf(fp, ">%s ", buf);
349 break;
350 case FASTA_DEFLINE:
351 if (buflen >= FASTA_BUFFER_LEN-1)
352 {
353 Uint4 index=buflen;
354 while (index > 0 && buf[index] != ' ')
355 {
356 if (buf[index] == '\001')
357 {
358 buf[index] = NULLB;
359 break;
360 }
361 index--;
362 }
363 }
364 fprintf(fp, "%s\n", buf);
365 break;
366 case FASTA_SEQLINE:
367 fprintf(fp, "%s\n", buf);
368 break;
369 case FASTA_EOS: /* end of sequence */
370 break;
371 default:
372 break;
373 }
374 return TRUE;
375 }
376 /*****************************************************************************
377 *
378 * SeqEntrysToFasta(sep, fp, is_na, group_segs)
379 *
380 * group_segs = 0 ... take only raw Bioseqs
381 * group_segs = 1 ... group segmented seqs into single entry.. no parts
382 * group_segs = 2 ... show only parts of segmented seqs
383 * group_segs = 3 ... like 1, but instantiate virtual Bioseqs
384 *
385 *****************************************************************************/
/*
 * SeqEntrysToFastaXX: build a MyFsa that prints to fp via FastaFileFunc
 * (70 characters per sequence line) and explore sep with SeqEntryFasta.
 *
 * group_segs 3 is translated into group_segs 1 with do_virtual switched on.
 * Returns FALSE for a NULL sep or fp; otherwise the got_one flag left by
 * the traversal.
 */
static Boolean SeqEntrysToFastaXX (SeqEntryPtr sep, FILE *fp, Boolean is_na, Uint1 group_segs, Boolean printid_general)
{
    Char     buf[FASTA_BUFFER_LEN+1];
    MyFsa    mfa;
    FastaDat tfa;

    if (sep == NULL || fp == NULL)
        return FALSE;

    MemSet ((Pointer) (&mfa), 0, sizeof (MyFsa));

    /* where output goes */
    mfa.buf    = buf;
    mfa.buflen = FASTA_BUFFER_LEN;
    mfa.seqlen = 70;
    mfa.mydata = (Pointer)fp;
    mfa.myfunc = FastaFileFunc;

    /* per-run state and options */
    mfa.bad_asn1        = FALSE;
    mfa.order           = 0;
    mfa.accession       = NULL;
    mfa.organism        = NULL;
    mfa.tech            = 0;
    mfa.no_sequence     = FALSE;
    mfa.formatdb        = FALSE;
    mfa.printid_general = printid_general;
    mfa.seqloc          = NULL;
    mfa.code            = is_na ? Seq_code_iupacna : Seq_code_ncbieaa;

    /* mode 3 means "mode 1 plus virtual Bioseqs" */
    if (group_segs == 3)
    {
        mfa.do_virtual = TRUE;
        group_segs = 1;
    }
    else
    {
        mfa.do_virtual = FALSE;
    }

    tfa.mfp         = &mfa;
    tfa.is_na       = is_na;
    tfa.group_segs  = group_segs;
    tfa.last_indent = -1;
    tfa.parts       = -1;
    tfa.seg         = -1;
    tfa.got_one     = FALSE;

    SeqEntryExplore(sep, (Pointer)&tfa, SeqEntryFasta);

    return tfa.got_one;
}
SeqEntrysToFasta(SeqEntryPtr sep,FILE * fp,Boolean is_na,Uint1 group_segs)428 NLM_EXTERN Boolean SeqEntrysToFasta (SeqEntryPtr sep, FILE *fp, Boolean is_na, Uint1 group_segs)
429 {
430 return SeqEntrysToFastaXX (sep, fp, is_na, group_segs, FALSE);
431 }
432 /*****************************************************************************
433 *
434 * SeqEntrysToFastaX(sep, mfa, is_na, group_segs)
435 *
436 *****************************************************************************/
SeqEntrysToFastaX(SeqEntryPtr sep,MyFsaPtr mfp,Boolean is_na,Uint1 group_segs)437 NLM_EXTERN Boolean SeqEntrysToFastaX (SeqEntryPtr sep, MyFsaPtr mfp, Boolean is_na, Uint1 group_segs)
438 {
439 FastaDat tfa;
440 if ((sep == NULL) || (mfp == NULL))
441 return FALSE;
442 tfa.mfp = mfp;
443 tfa.is_na = is_na;
444 if (group_segs == 3) /* do 2 things */
445 {
446 mfp->do_virtual = TRUE;
447 group_segs = 1;
448 }
449 tfa.group_segs = group_segs;
450 tfa.last_indent = -1;
451 tfa.parts = -1;
452 tfa.seg = -1;
453 tfa.got_one = FALSE;
454 SeqEntryExplore(sep, (Pointer)&tfa, SeqEntryFasta);
455 return tfa.got_one;
456 }
/*****************************************************************************
*
*   SeqEntrysToDefline(sep, fp, is_na, group_segs)
*
*****************************************************************************/
462 #define DEFLINE_MAX_LEN FASTA_BUFFER_LEN
SeqEntrysToDefline(SeqEntryPtr sep,FILE * fp,Boolean is_na,Uint1 group_segs)463 NLM_EXTERN Boolean SeqEntrysToDefline(SeqEntryPtr sep,
464 FILE *fp, Boolean is_na, Uint1 group_segs)
465 {
466 FastaDat tfa;
467 MyFsa mfa;
468 if ((sep == NULL) || (fp == NULL))
469 return FALSE;
470 MemSet ((Pointer) (&mfa), 0, sizeof (MyFsa));
471 mfa.buf = (CharPtr) MemNew(DEFLINE_MAX_LEN);
472 mfa.buflen = DEFLINE_MAX_LEN-1;
473 mfa.seqlen = DEFLINE_MAX_LEN;
474 mfa.mydata = (Pointer)fp;
475 mfa.myfunc = FastaFileFunc;
476 mfa.no_sequence = TRUE;
477 mfa.bad_asn1 = FALSE;
478 mfa.order = 0;
479 mfa.accession = NULL;
480 mfa.organism = NULL;
481 mfa.do_virtual = FALSE;
482 mfa.formatdb = FALSE;
483 mfa.tech = 0;
484 mfa.printid_general = FALSE;
485 mfa.seqloc = NULL;
486 tfa.mfp = &mfa;
487 tfa.is_na = is_na;
488 if (group_segs == 3) /* do 2 things */
489 {
490 mfa.do_virtual = TRUE;
491 group_segs = 1;
492 }
493 tfa.group_segs = group_segs;
494 tfa.last_indent = -1;
495 tfa.parts = -1;
496 tfa.seg = -1;
497 tfa.got_one = FALSE;
498 SeqEntryExplore(sep, (Pointer)&tfa, SeqEntryFasta);
499 MemFree(mfa.buf);
500 return tfa.got_one;
501 }
502 /*****************************************************************************
503 *
504 * Boolean BioseqRawToFasta(bsp, fp, is_na)
505 *
506 *****************************************************************************/
BioseqRawToFasta(BioseqPtr bsp,FILE * fp,Boolean is_na)507 NLM_EXTERN Boolean BioseqRawToFasta (BioseqPtr bsp, FILE *fp, Boolean is_na)
508 {
509 return BioseqRawToFastaExtra(bsp, fp, 80);
510 }
BioseqRawToFastaExtra(BioseqPtr bsp,FILE * fp,Int2 line_length)511 NLM_EXTERN Boolean BioseqRawToFastaExtra (BioseqPtr bsp, FILE *fp, Int2 line_length)
512 {
513 return BioseqRawToFastaExtraEx (bsp, fp, line_length, NULL);
514 }
BioseqRawToFastaExtraEx(BioseqPtr bsp,FILE * fp,Int2 line_length,SeqLocPtr slp)515 NLM_EXTERN Boolean BioseqRawToFastaExtraEx(BioseqPtr bsp, FILE *fp, Int2 line_length, SeqLocPtr slp)
516 {
517 MyFsa mfa;
518 Char buf[FASTA_BUFFER_LEN+1];
519 if ((bsp == NULL) || (fp == NULL))
520 return FALSE;
521 MemSet ((Pointer) (&mfa), 0, sizeof (MyFsa));
522 mfa.buf = buf;
523 mfa.buflen = FASTA_BUFFER_LEN;
524 mfa.seqlen = line_length;
525 mfa.mydata = (Pointer)fp;
526 mfa.myfunc = FastaFileFunc;
527 mfa.bad_asn1 = FALSE;
528 mfa.order = 0;
529 mfa.accession = NULL;
530 mfa.organism = NULL;
531 mfa.do_virtual = FALSE;
532 mfa.tech = 0;
533 mfa.no_sequence = FALSE;
534 mfa.formatdb = FALSE;
535 mfa.printid_general = FALSE;
536 mfa.seqloc = slp;
537 return BioseqRawToFastaX(bsp, &mfa, ISA_na(bsp->mol));
538 }
539 /*****************************************************************************
540 *
541 * Boolean BioseqRawToFastaX(bsp, mfp, is_na)
542 *
543 *****************************************************************************/
BioseqRawToFastaX(BioseqPtr bsp,MyFsaPtr mfp,Boolean is_na)544 NLM_EXTERN Boolean BioseqRawToFastaX (BioseqPtr bsp, MyFsaPtr mfp, Boolean is_na)
545 {
546 Uint1 repr;
547 if ((bsp == NULL) || (mfp == NULL))
548 return FALSE;
549 repr = Bioseq_repr(bsp);
550 if (! ((repr == Seq_repr_raw) || (repr == Seq_repr_const)))
551 return FALSE;
552 return BioseqToFastaX(bsp, mfp, is_na);
553 }
554 /*****************************************************************************
555 *
556 * Boolean BioseqToFasta(bsp, fp, is_na)
557 *
558 *****************************************************************************/
BioseqToFasta(BioseqPtr bsp,FILE * fp,Boolean is_na)559 NLM_EXTERN Boolean BioseqToFasta (BioseqPtr bsp, FILE *fp, Boolean is_na)
560 {
561 MyFsa mfa;
562 Char buf[FASTA_BUFFER_LEN+1];
563 if ((bsp == NULL) || (fp == NULL))
564 return FALSE;
565 MemSet ((Pointer) (&mfa), 0, sizeof (MyFsa));
566 mfa.buf = buf;
567 mfa.buflen = FASTA_BUFFER_LEN;
568 mfa.seqlen = 80;
569 mfa.mydata = (Pointer)fp;
570 mfa.myfunc = FastaFileFunc;
571 mfa.bad_asn1 = FALSE;
572 mfa.order = 0;
573 mfa.accession = NULL;
574 mfa.organism = NULL;
575 mfa.do_virtual = FALSE;
576 mfa.tech = 0;
577 mfa.no_sequence = FALSE;
578 mfa.formatdb = FALSE;
579 mfa.printid_general = FALSE;
580 mfa.seqloc = NULL;
581 return BioseqToFastaX(bsp, &mfa, is_na);
582 }
583 /*****************************************************************************
584 *
585 * Boolean BioseqToFastaDump(bsp, fp, is_na)
586 *
587 *****************************************************************************/
BioseqToFastaDump(BioseqPtr bsp,FILE * fp,Boolean is_na)588 NLM_EXTERN Boolean BioseqToFastaDump (BioseqPtr bsp, FILE *fp, Boolean is_na)
589 {
590 MyFsa mfa;
591 Char buf[FASTA_BUFFER_LEN+1];
592 if ((bsp == NULL) || (fp == NULL))
593 return FALSE;
594 MemSet ((Pointer) (&mfa), 0, sizeof (MyFsa));
595 mfa.buf = buf;
596 mfa.buflen = FASTA_BUFFER_LEN;
597 mfa.seqlen = 80;
598 mfa.mydata = (Pointer)fp;
599 mfa.myfunc = FastaDumpFileFunc;
600 mfa.bad_asn1 = FALSE;
601 mfa.order = 0;
602 mfa.accession = NULL;
603 mfa.organism = NULL;
604 mfa.do_virtual = FALSE;
605 mfa.tech = 0;
606 mfa.no_sequence = FALSE;
607 mfa.formatdb = FALSE;
608 mfa.printid_general = FALSE;
609 mfa.seqloc = NULL;
610 return BioseqToFastaX(bsp, &mfa, is_na);
611 }
612 /*****************************************************************************
613 *
614 * Boolean BioseqToFastaX(bsp, mfp, is_na)
615 *
616 *****************************************************************************/
617 static Boolean FastaIdX(BioseqPtr bsp, CharPtr buf, Uint4 buflen, Boolean printid_general, SeqLocPtr seqloc);
BioseqToFastaX(BioseqPtr bsp,MyFsaPtr mfp,Boolean is_na)618 NLM_EXTERN Boolean BioseqToFastaX (BioseqPtr bsp, MyFsaPtr mfp, Boolean is_na)
619 {
620 SeqPortPtr spp;
621 Uint1 repr, code;
622 Char buf[41];
623 SeqIdPtr sip;
624 Uint1 order = 255;
625 Boolean is_patent = FALSE, is_genbank = FALSE;
626 Uint1Ptr order_array;
627 int i;
628 CharPtr organism = NULL;
629 if ((bsp == NULL) || (mfp == NULL))
630 return FALSE;
631 repr = Bioseq_repr(bsp);
632 if (ISA_na(bsp->mol))
633 {
634 if (! is_na)
635 return FALSE;
636 order_array = na_order;
637 }
638 else if (ISA_aa(bsp->mol))
639 {
640 if (is_na)
641 return FALSE;
642 order_array = aa_order;
643 if (mfp->accession != NULL) /* translated genbank */
644 {
645 order = order_array[SEQID_GENBANK];
646 is_genbank = TRUE;
647 organism = mfp->organism;
648 }
649 }
650 else
651 {
652 buf[0] = '\0';
653 SeqIdWrite(SeqIdFindBest(bsp->id, 0), buf, PRINTID_FASTA_LONG, 40);
654 ErrPostEx(SEV_ERROR,0,0,"ToFasta: [%s] Unrecognized bsp->mol = %d",
655 buf, (int)(bsp->mol));
656 mfp->bad_asn1 = TRUE;
657 return FALSE;
658 }
659 mfp->bsp = bsp;
660 for (sip = bsp->id; sip != NULL; sip = sip->next)
661 {
662 i=(int)(sip->choice);
663 if (! is_genbank) /* don't change order for translated genbank */
664 {
665 if (order_array[i] < order)
666 order = order_array[i];
667 }
668 if (i == (int)SEQID_PATENT)
669 is_patent = TRUE;
670 else if (i == (int)SEQID_PRF)
671 organism = mfp->organism;
672 }
673 if (is_patent)
674 order = PATENT_ORDER;
675 mfp->order = order;
676 switch (mfp->tech)
677 {
678 case MI_TECH_est:
679 case MI_TECH_sts:
680 case MI_TECH_survey:
681 case MI_TECH_htgs_1:
682 case MI_TECH_htgs_2:
683 case MI_TECH_htgs_3:
684 organism = mfp->organism;
685 break;
686 default:
687 break;
688 }
689 if (! FastaIdX(bsp, mfp->buf, mfp->buflen, mfp->printid_general, mfp->seqloc))
690 return FALSE;
691 (*(mfp->myfunc))(bsp, FASTA_ID, mfp->buf, StringLen(mfp->buf), mfp->mydata);
692 if (! CreateDefLine(NULL, bsp, mfp->buf, mfp->buflen, mfp->tech, mfp->accession, organism))
693 return FALSE;
694 (*(mfp->myfunc))(bsp, FASTA_DEFLINE, mfp->buf, StringLen(mfp->buf), mfp->mydata);
695 if (mfp->formatdb && is_na) {
696 (*(mfp->myfunc))(bsp, FASTA_FORMATDB_AMB, mfp->buf, StringLen(mfp->buf), mfp->mydata);
697 }
698 else if(!mfp->no_sequence) {
699 if (!mfp->formatdb) {
700 if (is_na)
701 code = Seq_code_iupacna;
702 else
703 code = Seq_code_ncbieaa;
704 } else {
705 code = mfp->code;
706 }
707 if (repr == Seq_repr_virtual && (! mfp->do_virtual)) {
708 StringCpy (mfp->buf, "-");
709 (*(mfp->myfunc))(bsp, FASTA_SEQLINE, mfp->buf, StringLen(mfp->buf),
710 mfp->mydata);
711 (*(mfp->myfunc))(bsp, FASTA_EOS, mfp->buf, StringLen(mfp->buf),
712 mfp->mydata);
713 return TRUE;
714 }
715 spp = FastaSeqPortEx (bsp, is_na, mfp->do_virtual, code, mfp->seqloc);
716 if (spp == NULL) return FALSE;
717 while (FastaSeqLineEx(spp, mfp->buf, mfp->seqlen, is_na, mfp->do_virtual))
718 (*(mfp->myfunc))(bsp, FASTA_SEQLINE, mfp->buf, StringLen(mfp->buf),
719 mfp->mydata);
720 SeqPortFree(spp);
721 (*(mfp->myfunc))(bsp, FASTA_EOS, mfp->buf, StringLen(mfp->buf),
722 mfp->mydata);
723 }
724 return TRUE;
725 }
726
727 /*****************************************************************************
728 *
729 * BioseqFastaStream (bsp, fp, flags, linelen, blocklen, grouplen, do_defline)
730 *
731 * Rapid FASTA generator using SeqPortStream
732 *
733 *****************************************************************************/
734
735 typedef struct streamfsa {
736 FILE *fp;
737 ByteStorePtr bs;
738 Char buf [512];
739 Int2 idx;
740 Int2 lin;
741 Int2 blk;
742 Int2 grp;
743 Int2 linelen;
744 Int2 blocklen;
745 Int2 grouplen;
746 Int2 skip;
747 BIG_ID gi;
748 Int4 start;
749 Int4 seqpos;
750 Boolean seqspans;
751 } StreamFsa, PNTR StreamFsaPtr;
752
FsaStreamProc(CharPtr sequence,Pointer userdata)753 static void LIBCALLBACK FsaStreamProc (
754 CharPtr sequence,
755 Pointer userdata
756 )
757
758 {
759 Char ch;
760 StreamFsaPtr sfp;
761 Char spn [64];
762
763 if (StringHasNoText (sequence) || userdata == NULL) return;
764 sfp = (StreamFsaPtr) userdata;
765 ch = *sequence;
766 while (ch != '\0' && sfp->skip > 0) {
767 (sfp->skip)--;
768 (sfp->seqpos)++;
769 sequence++;
770 ch = *sequence;
771 }
772 while (ch != '\0') {
773 /* optionally separate blocks with space */
774 if (sfp->blk >= sfp->blocklen && sfp->blocklen > 0) {
775 sfp->buf [sfp->idx] = ' ';
776 (sfp->idx)++;
777 sfp->blk = 0;
778 }
779 /* save sequence character to buffer */
780 sfp->buf [sfp->idx] = ch;
781 (sfp->idx)++;
782 (sfp->lin)++;
783 (sfp->blk)++;
784 /* write sequence as soon as we have line of characters */
785 if (sfp->lin >= sfp->linelen) {
786 sfp->buf [sfp->idx] = '\0';
787 /* optionally separate groups with blank line */
788 if (sfp->grp >= sfp->grouplen && sfp->grouplen > 0) {
789 if (sfp->fp != NULL) {
790 fprintf (sfp->fp, "\n");
791 } else if (sfp->bs != NULL) {
792 BSWrite (sfp->bs, "\n", sizeof ("\n"));
793 }
794 sfp->grp = 0;
795 }
796 /* print actual sequence line here */
797 if (sfp->fp != NULL) {
798 if (sfp->seqspans) {
799 fprintf (sfp->fp, "<span class=\"ff_line\" id=\"gi_%ld_%ld\">", (long) sfp->gi, (long) (sfp->start + 1));
800 }
801 fprintf (sfp->fp, "%s", sfp->buf);
802 if (sfp->seqspans) {
803 fprintf (sfp->fp, "</span>");
804 }
805 fprintf (sfp->fp, "\n");
806 } else if (sfp->bs != NULL) {
807 if (sfp->seqspans) {
808 sprintf (spn, "<span class=\"ff_line\" id=\"gi_%ld_%ld\">", (long) sfp->gi, (long) (sfp->start + 1));
809 BSWrite (sfp->bs, spn, StringLen (spn));
810 }
811 BSWrite (sfp->bs, sfp->buf, StringLen (sfp->buf));
812 if (sfp->seqspans) {
813 BSWrite (sfp->bs, "</span>", sizeof ("</span>"));
814 }
815 BSWrite (sfp->bs, "\n", sizeof ("\n"));
816 }
817 sfp->start = sfp->seqpos + 1;
818 sfp->idx = 0;
819 sfp->lin = 0;
820 sfp->blk = 0;
821 (sfp->grp)++;
822 }
823 (sfp->seqpos)++;
824 sequence++;
825 ch = *sequence;
826 }
827 }
828
829 /* If Bioseq is a protein, does not have an accession,
830 * and is the only protein in the nuc-prot set or is part of
831 * a sorted protein file, use the nuc bioseq ID in the FASTA
832 * defline.
833 */
834
ChooseFastaID(BioseqPtr bsp,Boolean allow_mult)835 static SeqIdPtr ChooseFastaID (BioseqPtr bsp, Boolean allow_mult)
836
837 {
838 BioseqSetPtr bssp;
839 BioseqPtr nuc_bsp = NULL;
840 SeqIdPtr sip;
841 if (bsp == NULL) return NULL;
842 if (!ISA_aa(bsp->mol) || bsp->idx.parenttype != OBJ_BIOSEQSET || bsp->idx.parentptr == NULL) {
843 return bsp->id;
844 }
845 /* if protein sequence has an accession, do not use nucleotide ID */
846 sip = bsp->id;
847 while (sip != NULL) {
848 if (sip->choice == SEQID_GENBANK) {
849 return sip;
850 } else {
851 sip = sip->next;
852 }
853 }
854 bssp = (BioseqSetPtr) bsp->idx.parentptr;
855 if (bssp->_class != BioseqseqSet_class_nuc_prot /* not in nuc-prot set */
856 || bssp->seq_set == NULL /* no sequences in set - bad indexing */
857 || bssp->seq_set->next == NULL /* only one sequence in nuc-prot set, degenerate */
858 || (!allow_mult && bssp->seq_set->next->next != NULL) /* more than one protein in nuc-prot set */) {
859 return bsp->id;
860 }
861 if (IS_Bioseq (bssp->seq_set)) {
862 nuc_bsp = bssp->seq_set->data.ptrvalue;
863 } else if (IS_Bioseq_set (bssp->seq_set)) {
864 bssp = bssp->seq_set->data.ptrvalue;
865 if (bssp->_class == BioseqseqSet_class_segset
866 && bssp->seq_set != NULL
867 && IS_Bioseq (bssp->seq_set)) {
868 nuc_bsp = bssp->seq_set->data.ptrvalue;
869 }
870 }
871 if (nuc_bsp == NULL) {
872 return bsp->id;
873 } else {
874 return nuc_bsp->id;
875 }
876 }
877
/*
 * AddSubSourceValuesToNucTitle: append one "[name=value] " token to str for
 * every SubSource on biop.
 *
 *   biop  source whose subtype list is walked; NULL is a no-op
 *   str   NUL-terminated destination that is appended to; NULL is a no-op
 *
 * The qualifier name comes from GetSubsourceQualName ("subsource" when that
 * has no text) and is lower-cased; the value is wrapped in double quotes
 * when it contains '=', '[' or ']'.
 *
 * Each token is built directly at the end of str, so a long value cannot
 * overrun a fixed-size scratch buffer.
 * NOTE(review): the capacity of str is not known here; the caller must
 * supply a buffer large enough for everything appended.
 */
static void AddSubSourceValuesToNucTitle (
  BioSourcePtr biop,
  CharPtr str
)

{
  Boolean       needsQuotes;
  SubSourcePtr  ssp;
  CharPtr       ssp_name;
  CharPtr       tail;

  if (biop == NULL || str == NULL) return;

  for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
    /* start of the text contributed by this qualifier */
    tail = str + StringLen (str);

    StringCat (tail, "[");
    ssp_name = GetSubsourceQualName (ssp->subtype);
    if (StringHasNoText (ssp_name)) {
      StringCat (tail, "subsource");
    } else {
      StringCat (tail, ssp_name);
    }
    /* only "[name" exists at this point, so the value keeps its case */
    StringToLower (tail);

    needsQuotes = FALSE;
    if (StringChr (ssp->name, '=') != NULL ||
        StringChr (ssp->name, '[') != NULL ||
        StringChr (ssp->name, ']') != NULL) {
      needsQuotes = TRUE;
    }

    StringCat (tail, "=");
    if (needsQuotes) {
      StringCat (tail, "\"");
    }
    StringCat (tail, ssp->name);
    if (needsQuotes) {
      StringCat (tail, "\"");
    }
    StringCat (tail, "] ");
  }
}
919
/*
 * AddOrgModValuesToNucTitle: append one "[name=value] " token to str for
 * every OrgMod found under biop->org->orgname.
 *
 *   biop  source to read; a NULL biop, org or orgname is a no-op
 *   str   NUL-terminated destination that is appended to; NULL is a no-op
 *
 * The qualifier name comes from GetOrgModQualName and is lower-cased; the
 * value is wrapped in double quotes when it contains '=', '[' or ']'.
 *
 * Each token is built directly at the end of str, so a long value cannot
 * overrun a fixed-size scratch buffer.
 * NOTE(review): the capacity of str is not known here; the caller must
 * supply a buffer large enough for everything appended.
 */
static void AddOrgModValuesToNucTitle (
  BioSourcePtr biop,
  CharPtr str
)

{
  OrgModPtr  mod;
  CharPtr    mod_name;
  Boolean    needsQuotes;
  CharPtr    tail;

  if (biop == NULL || biop->org == NULL || biop->org->orgname == NULL || str == NULL) return;

  for (mod = biop->org->orgname->mod; mod != NULL; mod = mod->next) {
    /* start of the text contributed by this modifier */
    tail = str + StringLen (str);

    StringCat (tail, "[");
    mod_name = GetOrgModQualName (mod->subtype);
    StringCat (tail, mod_name);
    /* only "[name" exists at this point, so the value keeps its case */
    StringToLower (tail);

    needsQuotes = FALSE;
    if (StringChr (mod->subname, '=') != NULL ||
        StringChr (mod->subname, '[') != NULL ||
        StringChr (mod->subname, ']') != NULL) {
      needsQuotes = TRUE;
    }

    StringCat (tail, "=");
    if (needsQuotes) {
      StringCat (tail, "\"");
    }
    StringCat (tail, mod->subname);
    if (needsQuotes) {
      StringCat (tail, "\"");
    }
    StringCat (tail, "] ");
  }
}
957
MakeNucleotideTitleInSequinStyle(BioseqPtr bsp)958 static CharPtr MakeNucleotideTitleInSequinStyle (
959 BioseqPtr bsp
960 )
961
962 {
963 BioSourcePtr biop;
964 MolInfoPtr mip;
965 Boolean needsQuotes;
966 OrgNamePtr onp;
967 OrgRefPtr orp;
968 SeqDescrPtr sdp;
969 CharPtr str;
970 Uint1 tech = 0;
971 Char text [256];
972 CharPtr tmp;
973
974 if (bsp == NULL) return NULL;
975 if (! ISA_na (bsp->mol)) return NULL;
976 sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_source, NULL);
977 if (sdp == NULL) return NULL;
978 biop = (BioSourcePtr) sdp->data.ptrvalue;
979 if (biop == NULL) return NULL;
980 sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_molinfo, NULL);
981 if (sdp != NULL) {
982 mip = (MolInfoPtr) sdp->data.ptrvalue;
983 if (mip != NULL) {
984 switch (mip->tech) {
985 case MI_TECH_est :
986 case MI_TECH_sts :
987 case MI_TECH_survey :
988 case MI_TECH_htgs_1 :
989 case MI_TECH_htgs_2 :
990 case MI_TECH_htgs_3 :
991 case MI_TECH_fli_cdna :
992 case MI_TECH_htgs_0 :
993 case MI_TECH_htc :
994 case MI_TECH_wgs :
995 tech = mip->tech;
996 break;
997 default :
998 break;
999 }
1000 }
1001 }
1002 str = MemNew (5000);
1003
1004 orp = biop->org;
1005 if (orp != NULL) {
1006 needsQuotes = FALSE;
1007 if (StringChr (orp->taxname, '=') != NULL ||
1008 StringChr (orp->taxname, '[') != NULL ||
1009 StringChr (orp->taxname, ']') != NULL) {
1010 needsQuotes = TRUE;
1011 }
1012 StringCpy (text, "[organism=");
1013 if (needsQuotes) {
1014 StringCat (text, "\"");
1015 }
1016 StringCat (text, orp->taxname);
1017 if (needsQuotes) {
1018 StringCat (text, "\"");
1019 }
1020 StringCat (text, "] ");
1021 StringCat (str, text);
1022 }
1023
1024 AddSubSourceValuesToNucTitle (biop, str);
1025
1026 AddOrgModValuesToNucTitle (biop, str);
1027
1028 if (tech > 0) {
1029 StringCpy (text, "[tech=");
1030 StringCat (text, TechNameFromTech (tech));
1031 StringCat (text, "] ");
1032 StringCat (str, text);
1033 }
1034
1035 if (bsp->topology == TOPOLOGY_CIRCULAR) {
1036 StringCat (str, "[topology=circular] ");
1037 }
1038
1039 if (orp != NULL) {
1040 onp = orp->orgname;
1041 if (onp != NULL) {
1042 if (onp->gcode > 0) {
1043 sprintf (text, "[gcode=%d] ", (int) onp->gcode);
1044 StringCat (str, text);
1045 }
1046 if (onp->mgcode > 0) {
1047 sprintf (text, "[mgcode=%d] ", (int) onp->mgcode);
1048 StringCat (str, text);
1049 }
1050 if (onp->pgcode > 0) {
1051 sprintf (text, "[pgcode=%d] ", (int) onp->pgcode);
1052 StringCat (str, text);
1053 }
1054 }
1055 }
1056
1057 TrimSpacesAroundString (str);
1058 if (StringHasNoText (str)) {
1059 MemFree (str);
1060 return NULL;
1061 }
1062
1063 tmp = StringSave (str);
1064 MemFree (str);
1065
1066 return tmp;
1067 }
1068
BioseqFastaStreamInternal(BioseqPtr bsp,SeqLocPtr slp,SeqLitPtr lit,CharPtr str,FILE * fp,ByteStorePtr bs,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen,Boolean do_defline,Boolean substitute_ids,Boolean sorted_prot,Int2 skip)1069 static Int4 BioseqFastaStreamInternal (
1070 BioseqPtr bsp,
1071 SeqLocPtr slp,
1072 SeqLitPtr lit,
1073 CharPtr str,
1074 FILE *fp,
1075 ByteStorePtr bs,
1076 StreamFlgType flags,
1077 Int2 linelen,
1078 Int2 blocklen,
1079 Int2 grouplen,
1080 Boolean do_defline,
1081 Boolean substitute_ids,
1082 Boolean sorted_prot,
1083 Int2 skip
1084 )
1085
1086 {
1087 Char acc [41];
1088 SeqIdPtr accn = NULL;
1089 Char buf [4096];
1090 Char ch, ch1, ch2, ch3;
1091 Int4 count = 0;
1092 BIG_ID gi = -1;
1093 SeqIdPtr gpp = NULL;
1094 Char id [128];
1095 Uint1 id_format = PRINTID_FASTA_LONG;
1096 CharPtr original_id = NULL;
1097 CharPtr ptr;
1098 StreamFsa sf;
1099 SeqIdPtr sip = NULL;
1100 Char spn [64];
1101 CharPtr tmp;
1102
1103 if (bsp == NULL && slp == NULL && lit == NULL && str == NULL) return 0;
1104 if (fp == NULL && bs == NULL) return 0;
1105 if (bsp != NULL && bsp->repr == Seq_repr_virtual) return 0;
1106 if (linelen > 128) {
1107 linelen = 128;
1108 }
1109 if (linelen < 1) {
1110 linelen = 60;
1111 }
1112 if (blocklen > 100) {
1113 blocklen = 100;
1114 }
1115 if (blocklen < 1) {
1116 blocklen = 0;
1117 }
1118 if (grouplen > 100) {
1119 grouplen = 100;
1120 }
1121 if (grouplen < 1) {
1122 grouplen = 0;
1123 }
1124 acc [0] = '\0';
1125 MemSet ((Pointer) &sf, 0, sizeof (StreamFsa));
1126 sf.fp = fp;
1127 sf.bs = bs;
1128 sf.idx = 0;
1129 sf.lin = 0;
1130 sf.blk = 0;
1131 sf.grp = 0;
1132 sf.linelen = linelen;
1133 sf.blocklen = blocklen;
1134 sf.grouplen = grouplen;
1135 sf.skip = skip;
1136 sf.gi = 0;
1137 sf.start = 0;
1138 sf.seqpos = 0;
1139 sf.seqspans = (Boolean) ((flags & STREAM_HTML_SPANS) != 0);
1140 if (sf.seqspans) {
1141 if (bsp != NULL) {
1142 for (sip = bsp->id; sip != NULL; sip = sip->next) {
1143 switch (sip->choice) {
1144 case SEQID_GI :
1145 gi = sip->data.intvalue;
1146 break;
1147 case SEQID_GENBANK :
1148 case SEQID_EMBL :
1149 case SEQID_DDBJ :
1150 case SEQID_OTHER :
1151 accn = sip;
1152 break;
1153 case SEQID_PIR :
1154 case SEQID_SWISSPROT :
1155 case SEQID_PRF :
1156 case SEQID_PDB :
1157 accn = sip;
1158 break;
1159 case SEQID_TPG :
1160 case SEQID_TPE :
1161 case SEQID_TPD :
1162 accn = sip;
1163 break;
1164 case SEQID_GPIPE :
1165 /* should not override better accession */
1166 gpp = sip;
1167 break;
1168 default :
1169 break;
1170 }
1171 }
1172 } else if (slp != NULL) {
1173 /* PUBSEQ_OS will send a SeqInt with a chain of Seq-ids */
1174 for (sip = SeqLocId (slp); sip != NULL; sip = sip->next) {
1175 switch (sip->choice) {
1176 case SEQID_GI :
1177 gi = sip->data.intvalue;
1178 break;
1179 case SEQID_GENBANK :
1180 case SEQID_EMBL :
1181 case SEQID_DDBJ :
1182 case SEQID_OTHER :
1183 accn = sip;
1184 break;
1185 case SEQID_PIR :
1186 case SEQID_SWISSPROT :
1187 case SEQID_PRF :
1188 case SEQID_PDB :
1189 accn = sip;
1190 break;
1191 case SEQID_TPG :
1192 case SEQID_TPE :
1193 case SEQID_TPD :
1194 accn = sip;
1195 break;
1196 case SEQID_GPIPE :
1197 /* should not override better accession */
1198 gpp = sip;
1199 break;
1200 default :
1201 break;
1202 }
1203 }
1204 if (sip != NULL && sip->choice == SEQID_GI) {
1205 sf.gi = sip->data.intvalue;
1206 }
1207 }
1208 if (gi > 0) {
1209 sf.gi = gi;
1210 }
1211 if (accn == NULL) {
1212 accn = gpp;
1213 }
1214 if (accn != NULL) {
1215 SeqIdWrite (accn, acc, PRINTID_TEXTID_ACC_ONLY, sizeof (acc) - 1);
1216
1217 if (accn->choice == SEQID_PDB) {
1218 ptr = StringChr (acc, '_');
1219 if (ptr != NULL) {
1220 ch1 = ptr [1];
1221 if (ch1 != '\0') {
1222 ch2 = ptr [2];
1223 if (ch2 != '\0') {
1224 ch3 = ptr [3];
1225 if (ch3 == '\0') {
1226 if (ch1 == ch2) {
1227 if (IS_UPPER (ch1)) {
1228 ptr [1] = TO_LOWER (ch1);
1229 ptr [2] = '\0';
1230 }
1231 }
1232 }
1233 }
1234 }
1235 }
1236 }
1237 }
1238 }
1239 if (do_defline) {
1240 id [0] = '\0';
1241 if (ShouldUseOriginalID (bsp)) {
1242 original_id = FastaGetOriginalId (bsp);
1243 }
1244 if (substitute_ids) {
1245 sip = ChooseFastaID (bsp, sorted_prot);
1246 } else if (bsp != NULL) {
1247 sip = bsp->id;
1248 }
1249 if ((flags & STREAM_ALL_FASTA_IDS) != 0) {
1250 id_format = PRINTID_FASTA_ALL;
1251 }
1252 if (original_id != NULL && StringLen (original_id) + 5 < sizeof (id)) {
1253 sprintf (id, "lcl|%s", original_id);
1254 } else {
1255 SeqIdWrite (sip, id, id_format, sizeof (id) - 1);
1256 }
1257 /* no longer need to do feature indexing if title not present to speed up creation */
1258 /*
1259 sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_title, NULL);
1260 if (sdp == NULL) {
1261 entityID = ObjMgrGetEntityIDForPointer (bsp);
1262 if (! SeqMgrFeaturesAreIndexed (entityID)) {
1263 SeqMgrIndexFeatures (entityID, NULL);
1264 }
1265 }
1266 */
1267 buf [0] = '\0';
1268 if ((flags & STREAM_TAGGED_DEFLINE) != 0) {
1269 str = MakeNucleotideTitleInSequinStyle (bsp);
1270 StringNCpy_0 (buf, str, sizeof (buf));
1271 MemFree (str);
1272 } else {
1273 NewCreateDefLineBuf (NULL, bsp, buf, sizeof (buf), FALSE, FALSE);
1274 }
1275 tmp = buf;
1276 ch = *tmp;
1277 if (ch == '>') {
1278 *tmp = '_';
1279 }
1280 /*
1281 tmp = buf;
1282 ch = *tmp;
1283 while (ch != '\0') {
1284 if (ch == '>') {
1285 *tmp = '_';
1286 }
1287 tmp++;
1288 ch = *tmp;
1289 }
1290 */
1291 if (sf.fp != NULL) {
1292 fprintf (fp, ">%s %s\n", id, buf);
1293 } else if (sf.bs != NULL) {
1294 BSWrite (sf.bs, ">", sizeof (">"));
1295 BSWrite (sf.bs, id, StringLen (id));
1296 BSWrite (sf.bs, " ", sizeof (" "));
1297 BSWrite (sf.bs, buf, StringLen (buf));
1298 BSWrite (sf.bs, "\n", sizeof ("\n"));
1299 }
1300 }
1301 if (bsp != NULL) {
1302 count = SeqPortStream (bsp, flags, (Pointer) &sf, FsaStreamProc);
1303 } else if (slp != NULL) {
1304 count = SeqPortStreamLoc (slp, flags, (Pointer) &sf, FsaStreamProc);
1305 } else if (lit != NULL) {
1306 count = SeqPortStreamLit (lit, flags, (Pointer) &sf, FsaStreamProc);
1307 } else if (str != NULL) {
1308 count = StringLen (str);
1309 FsaStreamProc (str, (Pointer) &sf);
1310 }
1311 /* print any remaining sequence */
1312 if (sf.lin > 0) {
1313 sf.buf [sf.idx] = '\0';
1314 if (sf.grp >= sf.grouplen && sf.grouplen > 0) {
1315 if (sf.fp != NULL) {
1316 fprintf (fp, "\n");
1317 } else if (sf.bs != NULL) {
1318 BSWrite (sf.bs, "\n", sizeof ("\n"));
1319 }
1320 }
1321 if (sf.fp != NULL) {
1322 if (sf.seqspans) {
1323 fprintf (sf.fp, "<span class=\"ff_line\" id=\"gi_%ld_%ld\">", (long) sf.gi, (long) (sf.start + 1));
1324 }
1325 fprintf (sf.fp, "%s", sf.buf);
1326 if (sf.seqspans) {
1327 fprintf (sf.fp, "</span>");
1328 }
1329 fprintf (sf.fp, "\n");
1330 if (sf.seqspans) {
1331 fprintf (sf.fp, "<script type=\"text/javascript\">");
1332 fprintf (sf.fp, "if (typeof(oData) == \"undefined\") oData = []; ");
1333 fprintf (sf.fp, "oData.push({gi:%ld,acc:\"%s\"})", (long) sf.gi, acc);
1334 fprintf (sf.fp, "</script>\n");
1335 }
1336 } else if (sf.bs != NULL) {
1337 if (sf.seqspans) {
1338 sprintf (spn, "<span class=\"ff_line\" id=\"gi_%ld_%ld\">", (long) sf.gi, (long) (sf.start + 1));
1339 BSWrite (sf.bs, spn, StringLen (spn));
1340 }
1341 BSWrite (sf.bs, sf.buf, StringLen (sf.buf));
1342 if (sf.seqspans) {
1343 BSWrite (sf.bs, "</span>", sizeof ("</span>"));
1344 }
1345 BSWrite (sf.bs, "\n", sizeof ("\n"));
1346 if (sf.seqspans) {
1347 sprintf (spn, "<script type=\"text/javascript\">");
1348 BSWrite (sf.bs, spn, StringLen (spn));
1349 sprintf (spn, "if (typeof(oData) == \"undefined\") oData = []; ");
1350 BSWrite (sf.bs, spn, StringLen (spn));
1351 sprintf (spn, "oData.push({gi:%ld,acc:\"%s\"})", (long) sf.gi, acc);
1352 BSWrite (sf.bs, spn, StringLen (spn));
1353 sprintf (spn, "</script>\n");
1354 BSWrite (sf.bs, spn, StringLen (spn));
1355 }
1356 }
1357 }
1358 return count;
1359 }
1360
BioseqFastaStream(BioseqPtr bsp,FILE * fp,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen,Boolean do_defline)1361 NLM_EXTERN Int4 BioseqFastaStream (
1362 BioseqPtr bsp,
1363 FILE *fp,
1364 StreamFlgType flags,
1365 Int2 linelen,
1366 Int2 blocklen,
1367 Int2 grouplen,
1368 Boolean do_defline
1369 )
1370
1371 {
1372 return BioseqFastaStreamInternal (bsp, NULL, NULL, NULL, fp, NULL, flags,
1373 linelen, blocklen, grouplen,
1374 do_defline, FALSE, FALSE, 0);
1375 }
1376
BioseqFastaStreamEx(BioseqPtr bsp,FILE * fp,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen,Boolean do_defline,Boolean substitute_ids,Boolean sorted_protein)1377 NLM_EXTERN Int4 BioseqFastaStreamEx (
1378 BioseqPtr bsp,
1379 FILE *fp,
1380 StreamFlgType flags,
1381 Int2 linelen,
1382 Int2 blocklen,
1383 Int2 grouplen,
1384 Boolean do_defline,
1385 Boolean substitute_ids,
1386 Boolean sorted_protein
1387 )
1388
1389 {
1390 return BioseqFastaStreamInternal (bsp, NULL, NULL, NULL, fp, NULL, flags,
1391 linelen, blocklen, grouplen,
1392 do_defline, substitute_ids, sorted_protein, 0);
1393 }
1394
BioseqFastaMemStream(BioseqPtr bsp,ByteStorePtr bs,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen,Boolean do_defline)1395 NLM_EXTERN Int4 BioseqFastaMemStream (
1396 BioseqPtr bsp,
1397 ByteStorePtr bs,
1398 StreamFlgType flags,
1399 Int2 linelen,
1400 Int2 blocklen,
1401 Int2 grouplen,
1402 Boolean do_defline
1403 )
1404
1405 {
1406 return BioseqFastaStreamInternal (bsp, NULL, NULL, NULL, NULL, bs, flags,
1407 linelen, blocklen, grouplen,
1408 do_defline, FALSE, FALSE, 0);
1409 }
1410
SeqLocFastaStream(SeqLocPtr slp,FILE * fp,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen)1411 NLM_EXTERN Int4 SeqLocFastaStream (
1412 SeqLocPtr slp,
1413 FILE *fp,
1414 StreamFlgType flags,
1415 Int2 linelen,
1416 Int2 blocklen,
1417 Int2 grouplen
1418 )
1419
1420 {
1421 if (slp == NULL || fp == NULL) return 0;
1422
1423 return BioseqFastaStreamInternal (NULL, slp, NULL, NULL, fp, NULL, flags,
1424 linelen, blocklen, grouplen,
1425 FALSE, FALSE, FALSE, 0);
1426 }
1427
SeqLitFastaStream(SeqLitPtr lit,FILE * fp,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen)1428 NLM_EXTERN Int4 SeqLitFastaStream (
1429 SeqLitPtr lit,
1430 FILE *fp,
1431 StreamFlgType flags,
1432 Int2 linelen,
1433 Int2 blocklen,
1434 Int2 grouplen
1435 )
1436
1437 {
1438 if (lit == NULL || fp == NULL) return 0;
1439
1440 return BioseqFastaStreamInternal (NULL, NULL, lit, NULL, fp, NULL, flags,
1441 linelen, blocklen, grouplen,
1442 FALSE, FALSE, FALSE, 0);
1443 }
1444
DoSpecialDefline(SeqFeatPtr sfp,FILE * fp,CdRegionPtr crp,CharPtr idSuffix,SeqLocPtr mappedloc,BioseqPtr parentbsp)1445 static void DoSpecialDefline (
1446 SeqFeatPtr sfp,
1447 FILE *fp,
1448 CdRegionPtr crp,
1449 CharPtr idSuffix,
1450 SeqLocPtr mappedloc,
1451 BioseqPtr parentbsp
1452 )
1453
1454 {
1455 BioseqPtr bsp = NULL;
1456 Char buf [512];
1457 SeqFeatPtr cds;
1458 SeqMgrFeatContext cdscontext;
1459 Boolean do_defline = TRUE;
1460 Uint2 entityID;
1461 SeqFeatPtr gene = NULL;
1462 SeqMgrFeatContext genecontext;
1463 BIG_ID gi;
1464 GeneRefPtr grp;
1465 IntAsn2gbJob iaj;
1466 SeqLocPtr loc;
1467 Boolean partial5;
1468 Boolean partial3;
1469 BioseqPtr prod;
1470 CharPtr ptr;
1471 SeqIdPtr sip;
1472 CharPtr str;
1473 Char tmp [64];
1474 Boolean unlock = FALSE;
1475
1476 if (sfp == NULL || fp == NULL || crp == NULL) return;
1477
1478 MemSet ((Pointer) &genecontext, 0, sizeof (SeqMgrFeatContext));
1479 MemSet ((Pointer) &cdscontext, 0, sizeof (SeqMgrFeatContext));
1480
1481 if (do_defline) {
1482 bsp = BioseqFindFromSeqLoc (sfp->location);
1483 if (bsp == NULL) {
1484 sip = SeqLocId (sfp->location);
1485 if (sip == NULL) {
1486 loc = SeqLocFindNext (sfp->location, NULL);
1487 if (loc != NULL) {
1488 sip = SeqLocId (loc);
1489 }
1490 }
1491 if (sip != NULL) {
1492 bsp = BioseqLockById (sip);
1493 if (bsp != NULL) {
1494 unlock = TRUE;
1495 }
1496 }
1497 }
1498 if (bsp == NULL) {
1499 do_defline = FALSE;
1500 StringCpy (buf, "lcl|");
1501 sip = SeqLocId (sfp->location);
1502 if (sip != NULL) {
1503 SeqIdWrite (sip, tmp, PRINTID_TEXTID_ACC_VER, sizeof (tmp) - 1);
1504 StringCat (buf, tmp);
1505 }
1506 if (StringDoesHaveText (idSuffix) && StringLen (idSuffix) < 200) {
1507 StringCat (buf, idSuffix);
1508 }
1509 FastaFileFunc (bsp, FASTA_ID, buf, sizeof (buf), (Pointer) fp);
1510 StringCpy (buf, "?");
1511 FastaFileFunc (bsp, FASTA_DEFLINE, buf, sizeof (buf), (Pointer) fp);
1512 fflush (fp);
1513 }
1514 }
1515
1516 if (do_defline && bsp != NULL) {
1517 entityID = ObjMgrGetEntityIDForPointer (bsp);
1518 if (SeqMgrFeaturesAreIndexed (entityID) == 0) {
1519 SeqMgrIndexFeatures (entityID, NULL);
1520 }
1521 cds = SeqMgrGetDesiredFeature (0, bsp, 0, 0, sfp, &cdscontext);
1522 if (sfp != cds) {
1523 do_defline = FALSE;
1524 StringCpy (buf, "lcl|");
1525 sip = SeqIdFindWorst (bsp->id);
1526 if (sip != NULL) {
1527 SeqIdWrite (sip, tmp, PRINTID_TEXTID_ACC_VER, sizeof (tmp) - 1);
1528 StringCat (buf, tmp);
1529 }
1530 if (StringDoesHaveText (idSuffix) && StringLen (idSuffix) < 200) {
1531 StringCat (buf, idSuffix);
1532 }
1533 FastaFileFunc (bsp, FASTA_ID, buf, sizeof (buf), (Pointer) fp);
1534 StringCpy (buf, "??");
1535 FastaFileFunc (bsp, FASTA_DEFLINE, buf, sizeof (buf), (Pointer) fp);
1536 fflush (fp);
1537 }
1538 }
1539
1540 if (do_defline) {
1541 entityID = ObjMgrGetEntityIDForPointer (bsp);
1542 if (SeqMgrFeaturesAreIndexed (entityID) == 0) {
1543 SeqMgrIndexFeatures (entityID, NULL);
1544 }
1545
1546 CheckSeqLocForPartial (sfp->location, &partial5, &partial3);
1547
1548 grp = SeqMgrGetGeneXref (sfp);
1549 if (grp == NULL || (! SeqMgrGeneIsSuppressed (grp))) {
1550 gene = SeqMgrGetOverlappingGene (sfp->location, &genecontext);
1551 }
1552
1553 MemSet ((Pointer) &iaj, 0, sizeof (IntAsn2gbJob));
1554 iaj.flags.iupacaaOnly = FALSE;
1555 iaj.relModeError = FALSE;
1556
1557 if (parentbsp == NULL) {
1558 parentbsp = bsp;
1559 }
1560
1561 StringCpy (buf, "lcl|");
1562 sip = SeqIdFindWorst (parentbsp->id);
1563 if (sip != NULL) {
1564 SeqIdWrite (sip, tmp, PRINTID_TEXTID_ACC_VER, sizeof (tmp) - 1);
1565 StringCat (buf, tmp);
1566 }
1567 if (StringDoesHaveText (idSuffix) && StringLen (idSuffix) < 200) {
1568 StringCat (buf, idSuffix);
1569 }
1570
1571 FastaFileFunc (bsp, FASTA_ID, buf, sizeof (buf), (Pointer) fp);
1572
1573 buf [0] = '\0';
1574 if (StringDoesHaveText (genecontext.label)) {
1575 StringCat (buf, "[gene=");
1576 StringCat (buf, genecontext.label);
1577 StringCat (buf, "] ");
1578 }
1579 if (StringDoesHaveText (cdscontext.label)) {
1580 StringCat (buf, "[protein=");
1581 StringCat (buf, cdscontext.label);
1582 StringCat (buf, "] ");
1583 }
1584 if (crp->frame == 2) {
1585 StringCat (buf, "[frame=2] ");
1586 } else if (crp->frame == 3) {
1587 StringCat (buf, "[frame=3] ");
1588 }
1589 if (partial5 && partial3) {
1590 StringCat (buf, "[partial=5',3'] ");
1591 } else if (partial5) {
1592 StringCat (buf, "[partial=5'] ");
1593 } else if (partial3) {
1594 StringCat (buf, "[partial=3'] ");
1595 }
1596 if (sfp->product != NULL) {
1597 tmp [0] = '\0';
1598 sip = SeqLocId (sfp->product);
1599 if (sip != NULL && sip->choice == SEQID_GI) {
1600 prod = BioseqFind (sip);
1601 if (prod != NULL) {
1602 sip = SeqIdFindWorst (prod->id);
1603 SeqIdWrite (sip, tmp, PRINTID_TEXTID_ACC_VER, sizeof (tmp) - 1);
1604 } else {
1605 gi = sip->data.intvalue;
1606 sip = GetSeqIdForGI (gi);
1607 SeqIdWrite (sip, tmp, PRINTID_TEXTID_ACC_VER, sizeof (tmp));
1608 SeqIdFree (sip);
1609 }
1610 } else if (sip != NULL) {
1611 SeqIdWrite (sip, tmp, PRINTID_TEXTID_ACC_VER, sizeof (tmp));
1612 }
1613 if (StringDoesHaveText (tmp)) {
1614 StringCat (buf, "[protein_id=");
1615 StringCat (buf, tmp);
1616 StringCat (buf, "] ");
1617 }
1618 }
1619 if (mappedloc == NULL) {
1620 mappedloc = sfp->location;
1621 }
1622
1623 str = FFFlatLoc (&iaj, bsp, mappedloc, FALSE, FALSE);
1624
1625 ptr = (CharPtr) MemNew ((StringLen (buf) + StringLen (str) + 30) * sizeof (Char));
1626 if (ptr != NULL) {
1627 StringCpy (ptr, buf);
1628 if (str != NULL) {
1629 StringCat (ptr, "[location=");
1630 StringCat (ptr, str);
1631 StringCat (ptr, "] ");
1632 }
1633 TrimSpacesAroundString (ptr);
1634
1635 FastaFileFunc (bsp, FASTA_DEFLINE, ptr, StringLen (ptr), (Pointer) fp);
1636
1637 MemFree (ptr);
1638 }
1639
1640 MemFree (str);
1641
1642 fflush (fp);
1643
1644 if (unlock) {
1645 BioseqUnlock (bsp);
1646 }
1647 }
1648 }
1649
CdRegionFastaStreamEx(SeqFeatPtr sfp,FILE * fp,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen,Boolean do_defline,CharPtr idSuffix,SeqLocPtr mappedloc,BioseqPtr parentbsp)1650 NLM_EXTERN Int4 CdRegionFastaStreamEx (
1651 SeqFeatPtr sfp,
1652 FILE *fp,
1653 StreamFlgType flags,
1654 Int2 linelen,
1655 Int2 blocklen,
1656 Int2 grouplen,
1657 Boolean do_defline,
1658 CharPtr idSuffix,
1659 SeqLocPtr mappedloc,
1660 BioseqPtr parentbsp
1661 )
1662
1663 {
1664 CdRegionPtr crp;
1665 Int2 skip = 0;
1666
1667 if (sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION) return 0;
1668 if (fp == NULL) return 0;
1669 crp = (CdRegionPtr) sfp->data.value.ptrvalue;
1670 if (crp == NULL) return 0;
1671
1672 if (do_defline) {
1673 DoSpecialDefline (sfp, fp, crp, idSuffix, mappedloc, parentbsp);
1674 }
1675
1676 if (crp->frame == 2) {
1677 skip = 1;
1678 } else if (crp->frame == 3) {
1679 skip = 2;
1680 }
1681
1682 return BioseqFastaStreamInternal (NULL, sfp->location, NULL, NULL, fp, NULL, flags,
1683 linelen, blocklen, grouplen,
1684 FALSE, FALSE, FALSE, skip);
1685 }
1686
CdRegionFastaStream(SeqFeatPtr sfp,FILE * fp,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen,Boolean do_defline,CharPtr idSuffix)1687 NLM_EXTERN Int4 CdRegionFastaStream (
1688 SeqFeatPtr sfp,
1689 FILE *fp,
1690 StreamFlgType flags,
1691 Int2 linelen,
1692 Int2 blocklen,
1693 Int2 grouplen,
1694 Boolean do_defline,
1695 CharPtr idSuffix
1696 )
1697
1698 {
1699 return CdRegionFastaStreamEx (sfp, fp, flags, linelen, blocklen, grouplen,
1700 do_defline, idSuffix, NULL, NULL);
1701 }
1702
TranslationFastaStreamEx(SeqFeatPtr sfp,FILE * fp,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen,Boolean do_defline,CharPtr idSuffix,SeqLocPtr mappedloc,BioseqPtr parentbsp)1703 NLM_EXTERN Int4 TranslationFastaStreamEx (
1704 SeqFeatPtr sfp,
1705 FILE *fp,
1706 StreamFlgType flags,
1707 Int2 linelen,
1708 Int2 blocklen,
1709 Int2 grouplen,
1710 Boolean do_defline,
1711 CharPtr idSuffix,
1712 SeqLocPtr mappedloc,
1713 BioseqPtr parentbsp
1714 )
1715
1716 {
1717 ByteStorePtr bs;
1718 Char ch;
1719 Int4 count = 0;
1720 CdRegionPtr crp;
1721 size_t prtlen;
1722 CharPtr ptr;
1723 CharPtr str;
1724
1725 if (sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION) return 0;
1726 if (fp == NULL) return 0;
1727 crp = (CdRegionPtr) sfp->data.value.ptrvalue;
1728 if (crp == NULL) return 0;
1729
1730 if (do_defline) {
1731 DoSpecialDefline (sfp, fp, crp, idSuffix, mappedloc, parentbsp);
1732 }
1733
1734 str = NULL;
1735 bs = ProteinFromCdRegionEx (sfp, TRUE, FALSE);
1736 str = BSMerge (bs, NULL);
1737 bs = BSFree (bs);
1738
1739 if (str != NULL) {
1740 ptr = str;
1741 ch = *ptr;
1742 while (ch != '\0') {
1743 *ptr = TO_UPPER (ch);
1744 ptr++;
1745 ch = *ptr;
1746 }
1747 prtlen = StringLen (str);
1748 if (prtlen > 1) {
1749 if (str [prtlen - 1] == '*') {
1750 str [prtlen - 1] = '\0';
1751 }
1752 }
1753 }
1754
1755 count = BioseqFastaStreamInternal (NULL, NULL, NULL, str, fp, NULL, flags,
1756 linelen, blocklen, grouplen,
1757 FALSE, FALSE, FALSE, 0);
1758
1759 MemFree (str);
1760
1761 return count;
1762 }
1763
TranslationFastaStream(SeqFeatPtr sfp,FILE * fp,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen,Boolean do_defline,CharPtr idSuffix)1764 NLM_EXTERN Int4 TranslationFastaStream (
1765 SeqFeatPtr sfp,
1766 FILE *fp,
1767 StreamFlgType flags,
1768 Int2 linelen,
1769 Int2 blocklen,
1770 Int2 grouplen,
1771 Boolean do_defline,
1772 CharPtr idSuffix
1773 )
1774
1775 {
1776 return TranslationFastaStreamEx (sfp, fp, flags, linelen, blocklen, grouplen,
1777 do_defline, idSuffix, NULL, NULL);
1778 }
1779
DoGeneDefline(SeqFeatPtr sfp,FILE * fp,GeneRefPtr grp,CharPtr idSuffix,SeqLocPtr mappedloc,BioseqPtr parentbsp)1780 static void DoGeneDefline (
1781 SeqFeatPtr sfp,
1782 FILE *fp,
1783 GeneRefPtr grp,
1784 CharPtr idSuffix,
1785 SeqLocPtr mappedloc,
1786 BioseqPtr parentbsp
1787 )
1788
1789 {
1790 BioseqPtr bsp = NULL;
1791 Char buf [512];
1792 Boolean do_defline = TRUE;
1793 Uint2 entityID;
1794 SeqMgrFeatContext genecontext;
1795 IntAsn2gbJob iaj;
1796 Boolean partial5;
1797 Boolean partial3;
1798 SeqIdPtr sip;
1799 CharPtr str;
1800 Char tmp [64];
1801 Boolean unlock = FALSE;
1802
1803 if (sfp == NULL || fp == NULL || grp == NULL) return;
1804 if (sfp == NULL || fp == NULL || sfp->data.choice != SEQFEAT_GENE) return;
1805 grp = (GeneRefPtr) sfp->data.value.ptrvalue;
1806 if (grp == NULL) return;
1807
1808 if (do_defline) {
1809 bsp = BioseqFindFromSeqLoc (sfp->location);
1810 if (bsp == NULL) {
1811 sip = SeqLocId (sfp->location);
1812 if (sip != NULL) {
1813 bsp = BioseqLockById (sip);
1814 if (bsp != NULL) {
1815 unlock = TRUE;
1816 }
1817 }
1818 }
1819 if (bsp == NULL) {
1820 do_defline = FALSE;
1821 StringCpy (buf, "lcl|");
1822 sip = SeqLocId (sfp->location);
1823 if (sip != NULL) {
1824 SeqIdWrite (sip, tmp, PRINTID_TEXTID_ACC_VER, sizeof (tmp) - 1);
1825 StringCat (buf, tmp);
1826 }
1827 if (StringDoesHaveText (idSuffix) && StringLen (idSuffix) < 200) {
1828 StringCat (buf, idSuffix);
1829 }
1830 FastaFileFunc (bsp, FASTA_ID, buf, sizeof (buf), (Pointer) fp);
1831 StringCpy (buf, "?");
1832 FastaFileFunc (bsp, FASTA_DEFLINE, buf, sizeof (buf), (Pointer) fp);
1833 fflush (fp);
1834 }
1835 }
1836
1837 if (do_defline && bsp != NULL) {
1838 if (sfp != SeqMgrGetDesiredFeature (0, bsp, 0, 0, sfp, &genecontext)) {
1839 do_defline = FALSE;
1840 StringCpy (buf, "lcl|");
1841 sip = SeqIdFindWorst (bsp->id);
1842 if (sip != NULL) {
1843 SeqIdWrite (sip, tmp, PRINTID_TEXTID_ACC_VER, sizeof (tmp) - 1);
1844 StringCat (buf, tmp);
1845 }
1846 if (StringDoesHaveText (idSuffix) && StringLen (idSuffix) < 200) {
1847 StringCat (buf, idSuffix);
1848 }
1849 FastaFileFunc (bsp, FASTA_ID, buf, sizeof (buf), (Pointer) fp);
1850 StringCpy (buf, "??");
1851 FastaFileFunc (bsp, FASTA_DEFLINE, buf, sizeof (buf), (Pointer) fp);
1852 fflush (fp);
1853 }
1854 }
1855
1856 if (do_defline) {
1857 entityID = ObjMgrGetEntityIDForPointer (bsp);
1858 if (SeqMgrFeaturesAreIndexed (entityID) == 0) {
1859 SeqMgrIndexFeatures (entityID, NULL);
1860 }
1861
1862 CheckSeqLocForPartial (sfp->location, &partial5, &partial3);
1863
1864 MemSet ((Pointer) &iaj, 0, sizeof (IntAsn2gbJob));
1865 iaj.flags.iupacaaOnly = FALSE;
1866 iaj.relModeError = FALSE;
1867
1868 if (parentbsp == NULL) {
1869 parentbsp = bsp;
1870 }
1871
1872 StringCpy (buf, "lcl|");
1873 sip = SeqIdFindWorst (parentbsp->id);
1874 if (sip != NULL) {
1875 SeqIdWrite (sip, tmp, PRINTID_TEXTID_ACC_VER, sizeof (tmp) - 1);
1876 StringCat (buf, tmp);
1877 }
1878 if (StringDoesHaveText (idSuffix) && StringLen (idSuffix) < 200) {
1879 StringCat (buf, idSuffix);
1880 }
1881
1882 FastaFileFunc (bsp, FASTA_ID, buf, sizeof (buf), (Pointer) fp);
1883
1884 buf [0] = '\0';
1885 if (StringDoesHaveText (grp->locus)) {
1886 StringCat (buf, "[gene=");
1887 StringCat (buf, grp->locus);
1888 StringCat (buf, "] ");
1889 }
1890 if (StringDoesHaveText (grp->locus_tag)) {
1891 StringCat (buf, "[locus_tag=");
1892 StringCat (buf, grp->locus_tag);
1893 StringCat (buf, "] ");
1894 }
1895 if (StringLen (buf) == 0 && StringDoesHaveText (genecontext.label)) {
1896 StringCat (buf, "[gene=");
1897 StringCat (buf, genecontext.label);
1898 StringCat (buf, "] ");
1899 }
1900 if (mappedloc == NULL) {
1901 mappedloc = sfp->location;
1902 }
1903 str = FFFlatLoc (&iaj, bsp, mappedloc, FALSE, FALSE);
1904 if (str != NULL && StringLen (str) + StringLen (buf) < sizeof (buf) - 10) {
1905 StringCat (buf, "[location=");
1906 StringCat (buf, str);
1907 StringCat (buf, "] ");
1908 MemFree (str);
1909 }
1910 TrimSpacesAroundString (buf);
1911
1912 FastaFileFunc (bsp, FASTA_DEFLINE, buf, sizeof (buf), (Pointer) fp);
1913
1914 fflush (fp);
1915
1916 if (unlock) {
1917 BioseqUnlock (bsp);
1918 }
1919 }
1920 }
1921
GeneFastaStreamEx(SeqFeatPtr sfp,FILE * fp,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen,Boolean do_defline,CharPtr idSuffix,SeqLocPtr mappedloc,BioseqPtr parentbsp)1922 NLM_EXTERN Int4 GeneFastaStreamEx (
1923 SeqFeatPtr sfp,
1924 FILE *fp,
1925 StreamFlgType flags,
1926 Int2 linelen,
1927 Int2 blocklen,
1928 Int2 grouplen,
1929 Boolean do_defline,
1930 CharPtr idSuffix,
1931 SeqLocPtr mappedloc,
1932 BioseqPtr parentbsp
1933 )
1934
1935 {
1936 GeneRefPtr grp;
1937
1938 if (sfp == NULL || sfp->data.choice != SEQFEAT_GENE) return 0;
1939 if (fp == NULL) return 0;
1940 grp = (GeneRefPtr) sfp->data.value.ptrvalue;
1941 if (grp == NULL) return 0;
1942
1943 if (do_defline) {
1944 DoGeneDefline (sfp, fp, grp, idSuffix, mappedloc, parentbsp);
1945 }
1946
1947 return BioseqFastaStreamInternal (NULL, sfp->location, NULL, NULL, fp, NULL, flags,
1948 linelen, blocklen, grouplen,
1949 FALSE, FALSE, FALSE, 0);
1950 }
1951
GeneFastaStream(SeqFeatPtr sfp,FILE * fp,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen,Boolean do_defline,CharPtr idSuffix)1952 NLM_EXTERN Int4 GeneFastaStream (
1953 SeqFeatPtr sfp,
1954 FILE *fp,
1955 StreamFlgType flags,
1956 Int2 linelen,
1957 Int2 blocklen,
1958 Int2 grouplen,
1959 Boolean do_defline,
1960 CharPtr idSuffix
1961 )
1962
1963 {
1964 return GeneFastaStreamEx (sfp, fp, flags, linelen, blocklen, grouplen,
1965 do_defline, idSuffix, NULL, NULL);
1966 }
1967
/*****************************************************************************
*
*   SeqEntryFastaStream (sep, fp, flags, linelen, blocklen, grouplen,
*                        do_na, do_aa, master_style)
*
*     Rapid FASTA generator on ASN.1 record including GenBank release set
*
*****************************************************************************/
1976
1977 typedef struct fastastreamdata {
1978 FILE *fp;
1979 StreamFlgType flags;
1980 Int2 linelen;
1981 Int2 blocklen;
1982 Int2 grouplen;
1983 Boolean do_na;
1984 Boolean do_aa;
1985 Boolean master_style;
1986 Boolean failed;
1987 Int4 count;
1988 Boolean substitute_ids;
1989 Boolean sorted_prot;
1990 } FastaStreamData, PNTR FastaStreamPtr;
1991
GetSegParts(BioseqPtr bsp)1992 static BioseqSetPtr GetSegParts (
1993 BioseqPtr bsp
1994 )
1995
1996 {
1997 BioseqSetPtr bssp;
1998 SeqEntryPtr sep;
1999 if (bsp == NULL || bsp->repr != Seq_repr_seg) return NULL;
2000 sep = bsp->seqentry;
2001 if (sep == NULL) return NULL;
2002 sep = sep->next;
2003 if (sep == NULL || (! IS_Bioseq_set (sep))) return NULL;
2004 bssp = (BioseqSetPtr) sep->data.ptrvalue;
2005 if (bssp != NULL && bssp->_class == BioseqseqSet_class_parts) return bssp;
2006 return NULL;
2007 }
2008
/* Bioseq visitor: writes one Bioseq as FASTA according to the options in
   userdata (a FastaStreamPtr) and adds the result to the running total.

   Nucleotide Bioseqs are skipped unless do_na is set, protein Bioseqs unless
   do_aa is set.  A segmented Bioseq is replaced by its individual parts when
   master_style is off and a parts set follows it.  A negative count from
   BioseqFastaStreamEx marks the run as failed; its magnitude is still added
   to the total. */
static void FastaOneBioseq (BioseqPtr bsp, Pointer userdata)
{
  FastaStreamPtr  opts;
  BioseqSetPtr    parts;
  Int4            residues;
  Boolean         wanted = TRUE;

  if (bsp == NULL) return;
  opts = (FastaStreamPtr) userdata;
  if (opts == NULL) return;

  /* return if molecule not right for format */
  if (ISA_na (bsp->mol)) {
    wanted = opts->do_na;
  } else if (ISA_aa (bsp->mol)) {
    wanted = opts->do_aa;
  }
  if (! wanted) return;

  if (bsp->repr == Seq_repr_seg && (! opts->master_style)) {
    /* if bsp followed by parts set, recurse to make FASTA from individual parts */
    parts = GetSegParts (bsp);
    if (parts != NULL) {
      VisitBioseqsInSet (parts, (Pointer) opts, FastaOneBioseq);
      return;
    }
  }

  /* only these representations are streamed */
  switch (bsp->repr) {
    case Seq_repr_raw :
    case Seq_repr_seg :
    case Seq_repr_const :
    case Seq_repr_delta :
    case Seq_repr_ref :
    case Seq_repr_virtual :
      break;
    default :
      return;
  }

  residues = BioseqFastaStreamEx (bsp, opts->fp, opts->flags,
                                  opts->linelen, opts->blocklen, opts->grouplen,
                                  TRUE, opts->substitute_ids, opts->sorted_prot);
  if (residues < 0) {
    opts->failed = TRUE;
    opts->count -= residues;
  } else {
    opts->count += residues;
  }
}
2050
SeqEntryFastaStreamEx(SeqEntryPtr sep,FILE * fp,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen,Boolean do_na,Boolean do_aa,Boolean master_style,Boolean substitute_ids,Boolean sorted_prot)2051 NLM_EXTERN Int4 SeqEntryFastaStreamEx (
2052 SeqEntryPtr sep,
2053 FILE *fp,
2054 StreamFlgType flags,
2055 Int2 linelen,
2056 Int2 blocklen,
2057 Int2 grouplen,
2058 Boolean do_na,
2059 Boolean do_aa,
2060 Boolean master_style,
2061 Boolean substitute_ids,
2062 Boolean sorted_prot
2063 )
2064
2065 { BioseqPtr bsp = NULL;
2066 BioseqSetPtr bssp = NULL;
2067 Uint2 entityID = 0;
2068 FastaStreamData fsd;
2069 SeqEntryPtr oldscope;
2070 if (sep == NULL || fp == NULL) return 0;
2071 if (IS_Bioseq (sep)) {
2072 bsp = (BioseqPtr) sep->data.ptrvalue;
2073 entityID = ObjMgrGetEntityIDForPointer (bsp);
2074 } else if (IS_Bioseq_set (sep)) {
2075 bssp = (BioseqSetPtr) sep->data.ptrvalue;
2076 entityID = ObjMgrGetEntityIDForPointer (bssp);
2077 }
2078 if (entityID == 0) return 0;
2079 /* AssignIDs sets bsp->seqentry so GetSegParts can work */
2080 AssignIDsInEntity (entityID, 0, NULL);
2081 fsd.fp = fp;
2082 fsd.flags = flags;
2083 fsd.linelen = linelen;
2084 fsd.blocklen = blocklen;
2085 fsd.grouplen = grouplen;
2086 fsd.do_na = do_na;
2087 fsd.do_aa = do_aa;
2088 fsd.master_style = master_style;
2089 fsd.failed = FALSE;
2090 fsd.count = 0;
2091 fsd.substitute_ids = substitute_ids;
2092 fsd.sorted_prot = sorted_prot;
2093 oldscope = SeqEntrySetScope (sep);
2094 if (bssp != NULL) {
2095 /* handle all components of a pop/phy/mut/eco set */
2096 sep = SeqMgrGetSeqEntryForData (bssp);
2097 VisitSequencesInSep (sep, (Pointer) &fsd, VISIT_MAINS, FastaOneBioseq);
2098 } else {
2099 /* handle single bioseq, which may be segmented or a local part */
2100 FastaOneBioseq (bsp, (Pointer) &fsd);
2101 }
2102 SeqEntrySetScope (oldscope);
2103 if (fsd.failed) {
2104 return -fsd.count;
2105 }
2106 return fsd.count;
2107 }
2108
SeqEntryFastaStream(SeqEntryPtr sep,FILE * fp,StreamFlgType flags,Int2 linelen,Int2 blocklen,Int2 grouplen,Boolean do_na,Boolean do_aa,Boolean master_style)2109 NLM_EXTERN Int4 SeqEntryFastaStream (
2110 SeqEntryPtr sep,
2111 FILE *fp,
2112 StreamFlgType flags,
2113 Int2 linelen,
2114 Int2 blocklen,
2115 Int2 grouplen,
2116 Boolean do_na,
2117 Boolean do_aa,
2118 Boolean master_style
2119 )
2120
2121 { return SeqEntryFastaStreamEx (sep, fp, flags, linelen, blocklen, grouplen, do_na, do_aa, master_style, FALSE, FALSE);
2122 }
2123
MakeFastaStreamIdSuffix(SeqFeatPtr sfp,Uint4 idx,CharPtr prefix,CharPtr buf,Boolean do_product,Boolean do_feat_id)2124 NLM_EXTERN void MakeFastaStreamIdSuffix (
2125 SeqFeatPtr sfp,
2126 Uint4 idx,
2127 CharPtr prefix,
2128 CharPtr buf,
2129 Boolean do_product,
2130 Boolean do_feat_id
2131 )
2132
2133 {
2134 Char fbuf [64];
2135 BIG_ID gi;
2136 BioseqPtr pbsp;
2137 Char pbuf [64];
2138 SeqIdPtr sip;
2139
2140 if (sfp == NULL || buf == NULL) return;
2141
2142 StringCpy (buf, prefix);
2143 fbuf [0] = '\0';
2144 pbuf [0] = '\0';
2145 if (do_product && sfp->product != NULL) {
2146 pbsp = BioseqFindFromSeqLoc (sfp->product);
2147 if (pbsp != NULL) {
2148 SeqIdWrite (pbsp->id, pbuf, PRINTID_TEXTID_ACC_VER, sizeof (pbuf) - 1);
2149 } else {
2150 sip = SeqLocId (sfp->product);
2151 if (sip != NULL && sip->choice == SEQID_GI) {
2152 gi = sip->data.intvalue;
2153 sip = GetSeqIdForGI (gi);
2154 if (sip != NULL) {
2155 SeqIdWrite (sip, pbuf, PRINTID_TEXTID_ACC_VER, sizeof (pbuf) - 1);
2156 }
2157 }
2158 }
2159 }
2160 if (StringDoesHaveText (pbuf)) {
2161 StringCat (buf, "_");
2162 StringCat (buf, pbuf);
2163 }
2164 if (do_feat_id && idx > 0) {
2165 sprintf (fbuf, "%ld", (long) idx);
2166 StringCat (buf, "_");
2167 StringCat (buf, fbuf);
2168 }
2169 }
2170
2171 /*****************************************************************************
2172 *
2173 * Here are functions that convert FASTA format from file or from memory
2174 *
2175 *****************************************************************************/
2176 /********* DEFINES *********/
2177 #define FTSE_BUFF_CHUNK 4096
2178 #define BIOSEQ 1
2179 /********* INTERNAL FUNCTIONS *********/
2180 NLM_EXTERN SeqEntryPtr FastaToSeqEntryInternalEx
2181 (
2182 VoidPtr input, /* input pointer (file or memory) */
2183 Int4 type, /* type of inquiry FASTA_MEM_IO or FASTA_FILE_IO */
2184 CharPtr PNTR next_char, /* returned pointer to next FASTA sequence */
2185 Boolean is_na, /* type of sequence */
2186 CharPtr PNTR errormsg, /* error messge for debugging */
2187 Boolean parseSeqId, /* Parse SeqID from def line */
2188 CharPtr special_symbol, /* Returns special symbol if no SeqEntry */
2189 CharPtr prefix, /* prefix for localID if not parsable */
2190 Int2Ptr ctrptr, /* starting point for constructing unique ID */
2191 SeqLocPtr PNTR mask_ptr /* Pointer to a SeqLoc to Fill with Masking information */
2192 );
2193 NLM_EXTERN SeqEntryPtr FastaToSeqEntryInternal
2194 (
2195 VoidPtr input, /* input pointer (file or memory) */
2196 Int4 type, /* type of inquiry FASTA_MEM_IO or FASTA_FILE_IO */
2197 CharPtr PNTR last_char, /* returned pointer to next FASTA sequence */
2198 Boolean is_na, /* type of sequence */
2199 CharPtr PNTR errormsg, /* error messge for debugging */
2200 Boolean parseSeqId, /* Parse SeqID from def line */
2201 CharPtr special_symbol /* Returns special symbol if no SeqEntry */
2202 );
2203 static Boolean FastaReadSequenceInternal
2204 (
2205 VoidPtr input, /* input pointer (file or memory) */
2206 Int4 type, /* type of inquiry FASTA_MEM_IO or FASTA_FILE_IO */
2207 CharPtr PNTR last_char, /* returned pointer to next FASTA sequence */
2208 Boolean is_na, /* type of sequence */
2209 Int4Ptr seq_length, /* Returned length of sequence in residues */
2210 ByteStorePtr PNTR, /* Returned pointer to sequence ByteStore */
2211 CharPtr PNTR errormsg, /* error messge for debugging */
2212 CharPtr special_symbol /* Returns special symbol if no SeqEntry */
2213 );
2214 static Boolean FastaReadSequenceInternalEx
2215 (
2216 VoidPtr input, /* input pointer (file or memory) */
2217 Int4 type, /* type of inquiry FASTA_MEM_IO or FASTA_FILE_IO */
2218 CharPtr PNTR last_char, /* returned pointer to next FASTA sequence */
2219 Boolean is_na, /* type of sequence */
2220 Int4Ptr seq_length, /* Returned length of sequence in residues */
2221 ByteStorePtr PNTR, /* Returned pointer to sequence ByteStore */
2222 CharPtr PNTR errormsg, /* error messge for debugging */
2223 CharPtr special_symbol, /* Returns special symbol if no SeqEntry */
2224 SeqLocPtr PNTR mask_ptr,/* Pointer to a SeqLoc to Fill with Masking information */
2225 SeqIdPtr sip /* SeqId of current sequence used for Masking Info */
2226 );
2227 static Int4 FastaReadSequenceChunk
2228 (
2229 VoidPtr input, /* input pointer (file or memory) */
2230 Int4 type, /* type of inquiry FASTA_MEM_IO or FASTA_FILE_IO */
2231 CharPtr PNTR next_char, /* returned pointer to next FASTA sequence */
2232 Uint1Ptr sequence, /* buffer to read sequence to */
2233 Int4 length, /* size of buffer */
2234 CharPtr special_symbol /* Returns special symbol if no SeqEntry */
2235 );
2236 static SeqEntryPtr FastaToSeqEntryInternalExEx
2237 (
2238 VoidPtr input, /* input pointer (file or memory) */
2239 Int4 type, /* type of inquiry FASTA_MEM_IO or FASTA_FILE_IO */
2240 CharPtr PNTR next_char, /* returned pointer to next FASTA sequence */
2241 Boolean is_na, /* type of sequence */
2242 CharPtr PNTR errormsg, /* error messge for debugging */
2243 Boolean parseSeqId, /* Parse SeqID from def line */
2244 CharPtr special_symbol, /* Returns special symbol if no SeqEntry */
2245 CharPtr prefix, /* prefix for localID if not parsable */
2246 Int2Ptr ctrptr, /* starting point for constructing unique ID */
2247 SeqLocPtr PNTR mask_ptr, /* Pointer to a SeqLoc to Fill with Masking information */
2248 Boolean trustID
2249 );
2250 /********* FUNCTIONS *********/
2251 /*****************************************************************************
2252 *
2253 * SeqEntryPtr FastaToSeqBuffEx() - function to return SeqEntryPtr from
2254 * buffer with error handling
2255 *
2256 *****************************************************************************/
FastaToSeqBuffEx(CharPtr buffer,CharPtr PNTR last_char,Boolean is_na,CharPtr PNTR errormsg,Boolean parseSeqId)2257 NLM_EXTERN SeqEntryPtr FastaToSeqBuffEx
2258 (
2259 CharPtr buffer, /* buffer in memory with FASTA sequence */
2260 CharPtr PNTR last_char, /* here returned pointer to next FASTA if any */
2261 Boolean is_na, /* type of sequence */
2262 CharPtr PNTR errormsg, /* error message for debugging */
2263 Boolean parseSeqId /* Parse SeqID from def line */
2264 )
2265 {
2266 return FastaToSeqEntryInternal((void *)buffer, FASTA_MEM_IO ,
2267 last_char, is_na, errormsg, parseSeqId, NULL);
2268 }
FastaToSeqBuffForDb(CharPtr buffer,CharPtr PNTR last_char,Boolean is_na,CharPtr PNTR errormsg,Boolean parseSeqId,CharPtr prefix,Int2Ptr ctrptr,SeqLocPtr PNTR mask_ptr)2269 NLM_EXTERN SeqEntryPtr FastaToSeqBuffForDb
2270 (
2271 CharPtr buffer, /* buffer in memory with FASTA sequence */
2272 CharPtr PNTR last_char, /* here returned pointer to next FASTA if any */
2273 Boolean is_na, /* type of sequence */
2274 CharPtr PNTR errormsg, /* error message for debugging */
2275 Boolean parseSeqId, /* Parse SeqID from def line */
2276 CharPtr prefix, /* prefix for localID if not parsable */
2277 Int2Ptr ctrptr, /* starting point for constructing unique ID */
2278 SeqLocPtr PNTR mask_ptr /* Pointer to a SeqLoc to Fill with Masking information from lowercased letters */
2279 )
2280 {
2281 return FastaToSeqEntryInternalExEx((void *)buffer, FASTA_MEM_IO ,
2282 last_char, is_na, errormsg, parseSeqId,
2283 NULL, prefix, ctrptr, mask_ptr, TRUE);
2284 }
2285 /*****************************************************************************
2286 *
2287 * SeqEntryPtr FastaToSeqEntryEx() - function to return SeqEntryPtr from
2288 * file with error handling
2289 *
2290 *****************************************************************************/
FastaToSeqEntryEx(FILE * fp,Boolean is_na,CharPtr PNTR errormsg,Boolean parseSeqId)2291 NLM_EXTERN SeqEntryPtr FastaToSeqEntryEx
2292 (
2293 FILE *fp, /* file to get sequence from */
2294 Boolean is_na, /* type of sequence */
2295 CharPtr PNTR errormsg, /* error message for debugginq */
2296 Boolean parseSeqId /* Parse SeqID from def line */
2297 )
2298 {
2299 return FastaToSeqEntryInternal((void *)fp, FASTA_FILE_IO,
2300 NULL,is_na, errormsg, parseSeqId, NULL);
2301 }
2302 /*****************************************************************************
2303 *
2304 * SeqEntryPtr FastaToSeqEntryForDb() - function to return SeqEntryPtr from
2305 * file with error handling and with control
2306 * over generation of unique SeqIDs
2307 *
2308 *****************************************************************************/
FastaToSeqEntryForDb(FILE * fp,Boolean is_na,CharPtr PNTR errormsg,Boolean parseSeqId,CharPtr prefix,Int2Ptr ctrptr,SeqLocPtr PNTR mask_ptr)2309 NLM_EXTERN SeqEntryPtr FastaToSeqEntryForDb
2310 (
2311 FILE *fp, /* file to get sequence from */
2312 Boolean is_na, /* type of sequence */
2313 CharPtr PNTR errormsg, /* error message for debugginq */
2314 Boolean parseSeqId, /* Parse SeqID from def line */
2315 CharPtr prefix, /* prefix for localID if not parsable */
2316 Int2Ptr ctrptr, /* starting point for constructing unique ID */
2317 SeqLocPtr PNTR mask_ptr /* Pointer to a SeqLoc to Fill with Masking information from lowercased letters */
2318 )
2319 {
2320 return FastaToSeqEntryInternalExEx ((void *) fp, FASTA_FILE_IO,
2321 NULL, is_na, errormsg, parseSeqId,
2322 NULL, prefix, ctrptr, mask_ptr, TRUE);
2323 }
2324 /*****************************************************************************
2325 *
2326 * SeqEntryPtr FastaToSeqEntry() - function to return SeqEntryPtr from
2327 * file without error handling
2328 *
2329 *****************************************************************************/
FastaToSeqEntry(FILE * fp,Boolean is_na)2330 NLM_EXTERN SeqEntryPtr FastaToSeqEntry (FILE *fp, Boolean is_na)
2331 {
2332 return FastaToSeqEntryEx (fp, is_na, NULL, TRUE);
2333 }
2334 /*****************************************************************************
2335 *
2336 * SeqEntryPtr FastaToSeqBuff() - function to return SeqEntryPtr from
2337 * buffer without error handling
2338 *
2339 *****************************************************************************/
FastaToSeqBuff(CharPtr buffer,CharPtr PNTR last_char,Boolean is_na)2340 NLM_EXTERN SeqEntryPtr FastaToSeqBuff (CharPtr buffer, CharPtr PNTR last_char,
2341 Boolean is_na)
2342 {
2343 return FastaToSeqBuffEx (buffer, last_char, is_na, NULL, TRUE);
2344 }
2345 /*****************************************************************************
2346 *
2347 * Int4 FastaReadSequenceChunk() - read sequence chunk from
2348 * file or buffer for use in
2349 * FastaReadSequenceInternalEx()
2350 *****************************************************************************/
/* Fetch the next piece of raw sequence text into `sequence`.

   Returns the number of leading bytes of `sequence` the caller should
   consume; 0 means there is nothing more to read for this record.

   FASTA_FILE_IO: blank lines and lines starting with '!' or '#' are
   skipped.  If the next line starts with one of  > & { } [ ]  the
   character is pushed back onto the stream, stored in *special_symbol
   (when non-NULL) and 0 is returned.  Otherwise up to one line is read
   and the returned count is the offset of the first '\n', '\r' or NUL
   (or `length` if none is found).  next_char is not used in this mode.

   FASTA_MEM_IO: bytes are copied from `input` until the buffer is full,
   the terminating NUL is reached, or one of  > & { } [ ]  is seen at the
   start of the input or right after a line break; in the last case the
   character is stored in *special_symbol (when non-NULL).  *next_char is
   set to the first unconsumed byte, or to NULL when the NUL terminator
   was reached.  next_char must not be NULL in this mode. */
static Int4 FastaReadSequenceChunk
(
    VoidPtr input,            /* input pointer (file or memory) */
    Int4 type,                /* type of inquiry FASTA_MEM_IO or FASTA_FILE_IO */
    CharPtr PNTR next_char,   /* returned pointer to next FASTA sequence */
    Uint1Ptr sequence,        /* buffer to read sequence to */
    Int4 length,              /* size of buffer */
    CharPtr special_symbol    /* Returns special symbol if no SeqEntry */
)
{
    const Char PNTR cursor;
    FILE            *stream;
    Int4            count;
    Int2            ch = 0;
    Uint1           c;

    if (type == FASTA_FILE_IO) {
        stream = (FILE *) input;

        /* find the first character of the next non-empty, non-comment line */
        for (;;) {
            ch = NLM_GETC (stream);
            if (ch == '!' || ch == '#') {
                /* comment line: discard everything up to its end */
                do {
                    ch = NLM_GETC (stream);
                } while (! (ch == '\n' || ch == '\r' || ch == '\0' || ch == EOF));
            }
            if (ch == EOF)
                return 0;
            if (ch == '\n' || ch == '\r')
                continue;
            break;
        }

        switch (ch) {
        case '>': case '&': case '{': case '}': case '[': case ']':
            /* start of another record or a special marker: leave it in
               the stream for the caller */
            ungetc (ch, stream);
            if (special_symbol != NULL) {
                *special_symbol = (Char) ch;
            }
            return 0;
        default:
            break;
        }

        sequence[0] = (Uint1) ch;
        if (fgets ((CharPtr) sequence + 1, length - 1, stream) == NULL) {
            sequence[1] = '\0';
        }

        /* the usable part of the line stops at its terminator */
        count = 0;
        while (count < length) {
            c = sequence[count];
            if (c == '\n' || c == '\r' || c == '\0')
                break;
            count++;
        }
        return count;
    }

    /* type == FASTA_MEM_IO */
    cursor = (const Char PNTR) input;
    if (cursor == NULL)
        return 0;

    count = 0;
    while (count < length) {
        ch = *cursor;
        if (ch == NULLB)
            break;
        sequence[count] = (Char) ch;
        if (ch == '>' || ch == '&' || ch == '{' ||
            ch == '}' || ch == '[' || ch == ']') {
            /* only significant as the first character of a line */
            if (count == 0 ||
                sequence[count - 1] == '\n' || sequence[count - 1] == '\r') {
                if (special_symbol != NULL) {
                    *special_symbol = (Char) ch;
                }
                break;
            }
        }
        cursor++;
        count++;
    }

    if (ch == NULLB) {
        *next_char = NULL;              /* the end of buffer */
    } else {
        *next_char = (CharPtr) cursor;
    }
    return count;
}
2426 /*****************************************************************************
2427 *
2428 * Boolean FastaReadSequence() - read sequence from file
2429 *
2430 *****************************************************************************/
FastaReadSequence(FILE * fd,Boolean is_na,Int4Ptr seq_length,ByteStorePtr PNTR bs_out,CharPtr PNTR errormsg)2431 Boolean FastaReadSequence
2432 (
2433 FILE *fd, /* input pointer (file or memory) */
2434 Boolean is_na, /* type of sequence */
2435 Int4Ptr seq_length, /* Returned length of sequence in residues */
2436 ByteStorePtr PNTR bs_out, /* Returned pointer to sequence ByteStore */
2437 CharPtr PNTR errormsg /* error message for debugging */
2438 )
2439 {
2440 return FastaReadSequenceInternal((VoidPtr) fd, FASTA_FILE_IO, NULL,
2441 is_na, seq_length, bs_out, errormsg, NULL);
2442 }
2443 /*****************************************************************************
2444 *
2445 * Boolean FastaReadSequenceMem() - read sequence from buffer
2446 *
2447 *****************************************************************************/
FastaReadSequenceMem(CharPtr buffer,CharPtr PNTR next_char,Boolean is_na,Int4Ptr seq_length,ByteStorePtr PNTR bs_out,CharPtr PNTR errormsg)2448 Boolean FastaReadSequenceMem
2449 (
2450 CharPtr buffer, /* input buffer with sequence */
2451 CharPtr PNTR next_char, /* returned pointer to next FASTA sequence */
2452 Boolean is_na, /* type of sequence */
2453 Int4Ptr seq_length, /* Returned length of sequence in residues */
2454 ByteStorePtr PNTR bs_out, /* Returned pointer to sequence ByteStore */
2455 CharPtr PNTR errormsg /* error message for debugging */
2456 )
2457 {
2458 return FastaReadSequenceInternal((VoidPtr) buffer, FASTA_MEM_IO,
2459 next_char, is_na, seq_length, bs_out,
2460 errormsg, NULL);
2461 }
2462 /*****************************************************************************
2463 *
2464 * Boolean FastaReadSequenceInternal() - read sequence from
2465 * file or buffer for internal use
2466 *
2467 *****************************************************************************/
FastaReadSequenceInternal(VoidPtr input,Int4 type,CharPtr PNTR next_char,Boolean is_na,Int4Ptr seq_length,ByteStorePtr PNTR bs_out,CharPtr PNTR errormsg,CharPtr special_symbol)2468 static Boolean FastaReadSequenceInternal
2469 (
2470 VoidPtr input, /* input pointer (file or memory) */
2471 Int4 type, /* type of inquiry FASTA_MEM_IO or FASTA_FILE_IO */
2472 CharPtr PNTR next_char, /* returned pointer to next FASTA sequence */
2473 Boolean is_na, /* type of sequence */
2474 Int4Ptr seq_length, /* Returned length of sequence in residues */
2475 ByteStorePtr PNTR bs_out, /* Returned pointer to sequence ByteStore */
2476 CharPtr PNTR errormsg, /* error message for debugging */
2477 CharPtr special_symbol /* Returns special symbol if no SeqEntry */
2478 )
2479 {
2480 return FastaReadSequenceInternalEx(input,type,next_char,is_na,seq_length,bs_out,errormsg,special_symbol,NULL, NULL);
2481 }
2482 /*****************************************************************************
2483 *
2484 * Boolean FastaReadSequenceInternalEx() - read sequence from
2485 * file or buffer for internal use
2486 * and Create Masked SeqLoc of Lowercase sequences.
2487 *
2488 *****************************************************************************/
FastaReadSequenceInternalEx(VoidPtr input,Int4 type,CharPtr PNTR next_char,Boolean is_na,Int4Ptr seq_length,ByteStorePtr PNTR bs_out,CharPtr PNTR errormsg,CharPtr special_symbol,SeqLocPtr PNTR mask_ptr,SeqIdPtr sip)2489 static Boolean FastaReadSequenceInternalEx
2490 (
2491 VoidPtr input, /* input pointer (file or memory) */
2492 Int4 type, /* type of inquiry FASTA_MEM_IO or FASTA_FILE_IO */
2493 CharPtr PNTR next_char, /* returned pointer to next FASTA sequence */
2494 Boolean is_na, /* type of sequence */
2495 Int4Ptr seq_length, /* Returned length of sequence in residues */
2496 ByteStorePtr PNTR bs_out, /* Returned pointer to sequence ByteStore */
2497 CharPtr PNTR errormsg, /* error message for debugging */
2498 CharPtr special_symbol, /* Returns special symbol if no SeqEntry */
2499 SeqLocPtr PNTR mask_ptr, /* Pointer to a SeqLoc to Fill with Masking information */
2500 SeqIdPtr sip /* SeqId of current sequence used for Masking Info */
2501 )
2502 {
2503 SeqMapTablePtr smtp;
2504 Uint1Ptr in_buff, out_buff;
2505 CharPtr ptr, chptr;
2506 Int2 ch;
2507 Uint1 byte_from, uch;
2508 register Int4 i;
2509 CharPtr badchar = NULL;
2510 Int4 in_index, out_index, total_read, badchars = 0;
2511 Int4 total_length = 0;
2512 Int4 mask_to;
2513 Char tmp[32];
2514 ValNodePtr mask_head,mask,mask_new;
2515 SeqIntPtr mask_sint;
2516 Boolean Second, skip_to_eol, last_was_star;
2517 Boolean this_char_masked;
2518 if (input == NULL) /* empty input */
2519 return FALSE;
2520 /* Initializing conversion tables */
2521 if(is_na) {
2522 if((smtp = SeqMapTableFind(Seq_code_ncbi4na,
2523 Seq_code_iupacna)) == NULL) {
2524 return FALSE;
2525 }
2526 } else {
2527 if((smtp = SeqMapTableFind(Seq_code_ncbistdaa,
2528 Seq_code_ncbieaa)) == NULL) {
2529 return FALSE;
2530 }
2531 }
2532 /* Allocationg error message buffers if required */
2533 if (errormsg != NULL) {
2534 *errormsg = NULL;
2535 if((badchar = (CharPtr) MemNew(256)) == NULL)
2536 return FALSE;
2537 }
2538 if((in_buff = (Uint1Ptr) MemNew(FTSE_BUFF_CHUNK)) == NULL)
2539 return FALSE;
2540 if((out_buff = (Uint1Ptr) MemNew(FTSE_BUFF_CHUNK)) == NULL)
2541 return FALSE;
2542 if((*bs_out = BSNew(FTSE_BUFF_CHUNK)) == NULL)
2543 return FALSE;
2544 Second = FALSE;
2545 skip_to_eol = FALSE;
2546 last_was_star = FALSE;
2547 in_index = out_index = total_read = 0;
2548 if(mask_ptr) {
2549 mask_head=mask=NULL;
2550 mask_sint=NULL;
2551 this_char_masked=FALSE;
2552 }
2553 while(TRUE) {
2554 if (in_index == total_read) {
2555 if((total_read = FastaReadSequenceChunk(input, type,
2556 next_char, in_buff,
2557 FTSE_BUFF_CHUNK, special_symbol)) == 0)
2558 break; /* Here is exit from the loop */
2559 if(type == FASTA_MEM_IO)
2560 input = (VoidPtr) *next_char;
2561 in_index = 0;
2562 }
2563 byte_from = in_buff[in_index];
2564 in_index++;
2565 if ((! is_na) && (! last_was_star) && byte_from == '*') {
2566 last_was_star = TRUE;
2567 } else if(byte_from != ';' && !skip_to_eol) {
2568 if(mask_ptr) {
2569 if(IS_LOWER(byte_from)) {
2570 if(this_char_masked) {
2571 mask_to++;
2572 } else { /* First lowercase character in this segment */
2573 this_char_masked = TRUE;
2574 /* save previous segment if any */
2575 mask_new = ValNodeNew(NULL);
2576 mask_new->choice = SEQLOC_INT;
2577 if(mask_sint) {
2578 mask_sint->to = mask_to;
2579 mask->next = mask_new;
2580 } else {
2581 mask_head = mask_new;
2582 }
2583 mask = mask_new;
2584 mask_sint = SeqIntNew();
2585 mask_sint->from = total_length;
2586 mask_sint->to = total_length;
2587 mask_to = total_length;
2588 mask_sint->strand = Seq_strand_both;
2589 mask_sint->id = SeqIdDup(sip);
2590 mask_new->data.ptrvalue = mask_sint;
2591 }
2592 } else {
2593 this_char_masked = FALSE;
2594 }
2595 }
2596 byte_from = TO_UPPER (byte_from);
2597 if (is_na && byte_from == 'U') byte_from = 'T';
2598 if (is_na && byte_from == 'X') byte_from = 'N';
2599 if((uch = SeqMapTableConvert(smtp, byte_from)) !=
2600 INVALID_RESIDUE && byte_from != '-') {
2601 if (last_was_star) {
2602 total_length++;
2603 out_buff[out_index] = SeqMapTableConvert(smtp, '*');
2604 out_index++;
2605 if(out_index == FTSE_BUFF_CHUNK) {
2606 if(BSWrite(*bs_out, out_buff, out_index) != out_index) {
2607 MemFree(badchar);
2608 MemFree(in_buff);
2609 MemFree(out_buff);
2610 return FALSE;
2611 }
2612 out_index = 0;
2613 }
2614 last_was_star = FALSE;
2615 }
2616 total_length++;
2617 if(is_na) {
2618 if(!Second) {
2619 uch <<= 4;
2620 out_buff[out_index] = uch;
2621 } else {
2622 out_buff[out_index] += uch;
2623 out_index++;
2624 }
2625 Second = !Second;
2626 } else {
2627 out_buff[out_index] = uch;
2628 out_index++;
2629 }
2630 } else if (errormsg != NULL){
2631 if(IS_ALPHA(byte_from) || byte_from == '-' || byte_from == '?') {
2632 (badchar [(int) (byte_from)])++;
2633 badchars++;
2634 }
2635 }
2636 } else { /* ch == ';' */
2637 /* We have to ignore rest of the line */
2638 skip_to_eol = TRUE;
2639 while(in_index < total_read &&
2640 (byte_from = in_buff[in_index]) != '\n' &&
2641 byte_from != '\r')
2642 in_index++;
2643 /* Do not skip other passes if a line-return has
2644 been encountered as shown by examining less than the total
2645 (for FASTA_MEM_IO) or finding a line-return (FASTA_FILE_IO). */
2646 if(in_index < total_read ||
2647 (in_index < FTSE_BUFF_CHUNK &&
2648 (in_buff[in_index] == '\n' || in_buff[in_index] == '\r')))
2649 skip_to_eol = FALSE;
2650 }
2651 if(out_index == FTSE_BUFF_CHUNK) {
2652 if(BSWrite(*bs_out, out_buff, out_index) != out_index) {
2653 MemFree (badchar);
2654 MemFree(in_buff);
2655 MemFree(out_buff);
2656 return FALSE;
2657 }
2658 out_index = 0;
2659 }
2660 } /* while (TRUE) */
2661 /* We have to write remaining stuff in out_buff */
2662 if(is_na && Second) out_index++; /* Partial byte for DNA */
2663 if(BSWrite(*bs_out, out_buff, out_index) != out_index) {
2664 MemFree (badchar);
2665 MemFree(in_buff);
2666 MemFree(out_buff);
2667 return FALSE;
2668 }
2669 *seq_length = total_length;
2670 /* If required bad characters statistics */
2671 if (errormsg != NULL && badchars > 0) {
2672 if((ptr = (CharPtr) MemNew (sizeof(Char)*512)) == NULL)
2673 return FALSE;
2674 chptr = "";
2675 sprintf (ptr, "%ld illegal %s %s removed:\n", (long) badchars,
2676 badchars == 1 ? "character" : "characters",
2677 badchars == 1 ? "was" : "were"
2678 );
2679 for (ch = 'A', i =0; ch <= 'Z'; ch++, i++) {
2680 if ((badchar[ch]) > 0) {
2681 sprintf (tmp, "%s%d %c%s",
2682 chptr, (int) badchar[ch], ch,
2683 badchar[ch] == 1 ? "" : "s");
2684 StringCat (ptr, tmp);
2685 chptr = ", ";
2686 }
2687 }
2688 ch = '-';
2689 if ((badchar[ch]) > 0) {
2690 sprintf (tmp, "%s%d %c%s",
2691 chptr, badchar[ch], ch,
2692 badchar[ch] == 1 ? "" : "s");
2693 StringCat (ptr, tmp);
2694 chptr = ", ";
2695 }
2696 ch = '?';
2697 if ((badchar[ch]) > 0) {
2698 sprintf (tmp, "%s%d %c%s",
2699 chptr, badchar[ch], ch,
2700 badchar[ch] == 1 ? "" : "s");
2701 StringCat (ptr, tmp);
2702 chptr = ", ";
2703 }
2704 *errormsg = StringSave (ptr);
2705 MemFree (ptr);
2706 }
2707 MemFree (badchar);
2708 MemFree(in_buff);
2709 MemFree(out_buff);
2710 if(mask_ptr && mask_head) {
2711 SeqLocPtr slp;
2712 if(mask_sint) {
2713 mask_sint->to = mask_to;
2714 mask->next = NULL;
2715 }
2716 slp = SeqLocNew(NULL);
2717 slp->choice = SEQLOC_PACKED_INT;
2718 slp->data.ptrvalue = mask_head;
2719 *mask_ptr = slp;
2720 }
2721 return TRUE;
2722 }
MakeTrustedID(CharPtr prefix,Int2Ptr ctrptr)2723 static SeqIdPtr MakeTrustedID (CharPtr prefix, Int2Ptr ctrptr)
2724 {
2725 Char buf[128];
2726 ValNodePtr newid;
2727 ObjectIdPtr oid;
2728 Int2 start = 1;
2729 if (ctrptr != NULL) {
2730 start = *ctrptr;
2731 }
2732 if (start < 1) {
2733 start = 1;
2734 }
2735 if (prefix)
2736 sprintf (buf, "%d_%.32s", (int) start, prefix);
2737 else
2738 sprintf(buf, "%d", (int) start);
2739 newid = ValNodeNew (NULL);
2740 oid = ObjectIdNew ();
2741 newid->choice = SEQID_LOCAL;
2742 newid->data.ptrvalue = oid;
2743 oid->str = StringSave (buf);
2744 if (ctrptr != NULL) {
2745 *ctrptr = start + 1;
2746 }
2747 return newid;
2748 }
FastaToSeqEntryInternalExEx(VoidPtr input,Int4 type,CharPtr PNTR next_char,Boolean is_na,CharPtr PNTR errormsg,Boolean parseSeqId,CharPtr special_symbol,CharPtr prefix,Int2Ptr ctrptr,SeqLocPtr PNTR mask_ptr,Boolean trustID)2749 static SeqEntryPtr FastaToSeqEntryInternalExEx
2750 (
2751 VoidPtr input, /* input pointer (file or memory) */
2752 Int4 type, /* type of inquiry FASTA_MEM_IO or FASTA_FILE_IO */
2753 CharPtr PNTR next_char, /* returned pointer to next FASTA sequence */
2754 Boolean is_na, /* type of sequence */
2755 CharPtr PNTR errormsg, /* error messge for debugging */
2756 Boolean parseSeqId, /* Parse SeqID from def line */
2757 CharPtr special_symbol, /* Returns special symbol if no SeqEntry */
2758 CharPtr prefix, /* prefix for localID if not parsable */
2759 Int2Ptr ctrptr, /* starting point for constructing unique ID */
2760 SeqLocPtr PNTR mask_ptr, /* Pointer to a SeqLoc to Fill with Masking information */
2761 Boolean trustID
2762 )
2763 {
2764 SeqEntryPtr sep = NULL;
2765 BioseqPtr bsp = NULL;
2766 ValNodePtr vnp = NULL;
2767 Int2 ch;
2768 CharPtr chptr = NULL, ptr = NULL;
2769 register Int4 i;
2770 CharPtr defline, buffer= NULL; /* Working buffers */
2771 Int4 BuffSize = FTSE_BUFF_CHUNK;
2772 long len = 0;
2773 FILE *fd;
2774 const Char PNTR firstchar;
2775 Boolean is_gap = FALSE;
2776 if (special_symbol != NULL) {
2777 *special_symbol = '\0';
2778 }
2779 if (input == NULL) /* empty input */
2780 return NULL;
2781 /* Type of input depends upon calling function */
2782 if(type == FASTA_FILE_IO)
2783 fd = (FILE *) input;
2784 else /* type == FASTA_MEM_IO */
2785 firstchar = (const Char PNTR) input;
2786 /* Rolling spaces to check first non-space character */
2787 if(type == FASTA_FILE_IO) {
2788 do {
2789 ch = NLM_GETC(fd);
2790 if (ch == '!' || ch == '#') { /* comment symbol - ignore rest of line */
2791 do {
2792 ch = NLM_GETC(fd);
2793 } while (ch != '\n' && ch != '\r' && ch != '\0' && ch != EOF);
2794 }
2795 } while (IS_WHITESP(ch));
2796 } else { /* if(type == FASTA_MEM_IO*/
2797 while (IS_WHITESP(ch = *firstchar)) /* Rolling spaces */
2798 firstchar++;
2799 }
2800 if(ch == EOF || ch == NULLB || ch == '&' || ch == '{' ||
2801 ch == '}' || ch == '[' || ch == ']') {
2802 /* This is empty FILE or buffer or special symbol detected */
2803 if (special_symbol != NULL) {
2804 *special_symbol = ch;
2805 }
2806 return NULL;
2807 }
2808 /* First character is valid: initializing main structures */
2809 /* Initializing Seq-entry structure */
2810 if((sep = SeqEntryNew()) == NULL) {
2811 MemFree(buffer);
2812 return NULL;
2813 }
2814 sep->choice = BIOSEQ; /* == 1 */
2815 if((bsp = BioseqNew()) == NULL) {
2816 MemFree(buffer);
2817 return NULL;
2818 }
2819 sep->data.ptrvalue = bsp;
2820 SeqMgrSeqEntry (SM_BIOSEQ, (Pointer)bsp, sep);
2821 if (is_na) {
2822 bsp->mol = Seq_mol_na;
2823 bsp->seq_data_type = Seq_code_ncbi4na;
2824 } else {
2825 bsp->mol = Seq_mol_aa;
2826 bsp->seq_data_type = Seq_code_ncbistdaa;
2827 }
2828 bsp->repr = Seq_repr_raw;
2829 /* ------------- */
2830 /* Now reading defline into memory */
2831 /* DEFLINE PROCCESSING*/
2832 if(ch == '>') { /* Defline is present - processing */
2833 if((buffer = (CharPtr) MemNew(BuffSize+1)) == NULL)
2834 return NULL;
2835 if(type == FASTA_FILE_IO) { /* File */
2836 buffer[0] = (Char) ch;
2837 i = 0;
2838 fgets(buffer+1, BuffSize, fd);
2839 while (1)
2840 {
2841 while (i<BuffSize-1)
2842 {
2843 if(buffer[i] == '\n' || buffer[i] == '\r' || buffer[i] == NULLB)
2844 {
2845 buffer[i] = NULLB;
2846 break;
2847 }
2848 i++;
2849 }
2850 if (i == BuffSize-1 && (buffer[i] == '\n' || buffer[i] == '\r'))
2851 {
2852 buffer[i] = NULLB;
2853 }
2854 if (buffer[i] == NULLB)
2855 break;
2856 BuffSize = i + FTSE_BUFF_CHUNK;
2857 if((buffer = (CharPtr)Realloc(buffer, BuffSize+1)) == NULL)
2858 {
2859 ErrLogPrintf("Error re-allocating memory in FastaToSeqEntry");
2860 MemFree(buffer);
2861 return NULL;
2862 }
2863 fgets(buffer+i+1, FTSE_BUFF_CHUNK, fd);
2864 }
2865 } else { /* type = FASTA_MEM_IO */
2866 for(i =0; (ch = *firstchar) != NULLB; firstchar++, i++) {
2867 if (i >= BuffSize) {
2868 BuffSize = i + FTSE_BUFF_CHUNK;
2869 buffer = (CharPtr) Realloc(buffer, BuffSize);
2870 }
2871 if((buffer[i] = (Char) ch) == '\n' || ch == '\r') {
2872 break;
2873 }
2874 }
2875 buffer[i] = NULLB;
2876 if(ch == NULLB) {/* the end of buffer */
2877 *next_char = NULL;
2878 input = (VoidPtr) "\0";
2879 } else {
2880 *next_char = (CharPtr) firstchar;
2881 input = (VoidPtr) firstchar;
2882 }
2883 }
2884 defline = buffer+1; /* Character after '>' */
2885 if(defline[0] != '?') {
2886 /* Creating standard Seq-id */
2887 ptr = defline;
2888 while (IS_WHITESP(*ptr))
2889 ptr++;
2890 if (parseSeqId) {
2891 if (*ptr == '"') {
2892 ptr++;
2893 chptr = StringChr (ptr, '"');
2894 } else {
2895 for (chptr = ptr; *chptr != NULLB && !IS_WHITESP(*chptr);
2896 chptr++) continue;
2897 if (*chptr == NULLB)
2898 chptr = NULL;
2899 }
2900 }
2901 if (!parseSeqId) {
2902 chptr = ptr;
2903 } else if (chptr != NULL) {
2904 *chptr = NULLB;
2905 chptr++;
2906 bsp->id = MakeSeqID (ptr);
2907 } else if (*ptr != NULLB) {
2908 bsp->id = MakeSeqID (ptr);
2909 }
2910 if (bsp->id == NULL) {
2911 if (trustID) {
2912 bsp->id = MakeTrustedID (prefix, ctrptr);
2913 } else {
2914 bsp->id = MakeNewProteinSeqIdExMT (NULL, NULL, prefix, ctrptr, TRUE);
2915 }
2916 }
2917 if (chptr != NULL) {
2918 if((vnp = SeqDescrNew(NULL)) != NULL) {
2919 vnp->choice = Seq_descr_title;
2920 while (IS_WHITESP(*chptr))
2921 chptr++;
2922 vnp->data.ptrvalue = StringSave (chptr);
2923 }
2924 bsp->descr = vnp;
2925 }
2926 } else {
2927 /* Unknown Seq-id */
2928 ptr = defline + 1;
2929 while (IS_WHITESP(*ptr))
2930 ptr++;
2931 if (StringNCmp (ptr, "unk100", 6) == 0) {
2932 bsp->id = MakeSeqID ("lcl|unk100");
2933 ptr += 3;
2934 } else {
2935 bsp->id = MakeSeqID ("lcl|gap");
2936 }
2937 bsp->repr = Seq_repr_virtual;
2938 if(*ptr != '\0' && sscanf(ptr, "%ld", &len) == 1 && len > 0) {
2939 bsp->length = (Int4) len;
2940 } else {
2941 bsp->length = -1;
2942 }
2943 is_gap = TRUE;
2944 }
2945 MemFree(buffer);
2946 } else { /* if ch == '>' EMPTY DEFLINE */
2947 /* Defline is upsent - creating default defline */
2948 if (trustID) {
2949 bsp->id = MakeTrustedID (prefix, ctrptr);
2950 } else {
2951 bsp->id = MakeNewProteinSeqIdExMT (NULL, NULL, prefix, ctrptr, TRUE);
2952 }
2953 if(type == FASTA_FILE_IO)
2954 ungetc(ch, fd);
2955 }
2956 SeqMgrAddToBioseqIndex (bsp);
2957 /* OK, now processing sequence */
2958 if (! is_gap) {
2959 if(!FastaReadSequenceInternalEx(input, type, next_char, is_na,
2960 &bsp->length,
2961 (ByteStorePtr PNTR) &(bsp->seq_data),
2962 errormsg, special_symbol,
2963 mask_ptr, bsp->id)) {
2964 ErrPostEx(SEV_FATAL, 0, 0, "Failure to read sequence. "
2965 "FastaToSeqEntry() failed.\n");
2966 return NULL;
2967 }
2968 }
2969 BioseqPack(bsp); /* Trying to pack Bioseq more */
2970 return sep;
2971 }
FastaToSeqEntryInternalEx(VoidPtr input,Int4 type,CharPtr PNTR next_char,Boolean is_na,CharPtr PNTR errormsg,Boolean parseSeqId,CharPtr special_symbol,CharPtr prefix,Int2Ptr ctrptr,SeqLocPtr PNTR mask_ptr)2972 NLM_EXTERN SeqEntryPtr FastaToSeqEntryInternalEx
2973 (
2974 VoidPtr input, /* input pointer (file or memory) */
2975 Int4 type, /* type of inquiry FASTA_MEM_IO or FASTA_FILE_IO */
2976 CharPtr PNTR next_char, /* returned pointer to next FASTA sequence */
2977 Boolean is_na, /* type of sequence */
2978 CharPtr PNTR errormsg, /* error messge for debugging */
2979 Boolean parseSeqId, /* Parse SeqID from def line */
2980 CharPtr special_symbol, /* Returns special symbol if no SeqEntry */
2981 CharPtr prefix, /* prefix for localID if not parsable */
2982 Int2Ptr ctrptr, /* starting point for constructing unique ID */
2983 SeqLocPtr PNTR mask_ptr /* Pointer to a SeqLoc to Fill with Masking information */
2984 )
2985 {
2986 return FastaToSeqEntryInternalExEx (input, type, next_char, is_na, errormsg,
2987 parseSeqId, special_symbol, prefix, ctrptr,
2988 mask_ptr, FALSE);
2989 }
FastaToSeqEntryInternal(VoidPtr input,Int4 type,CharPtr PNTR next_char,Boolean is_na,CharPtr PNTR errormsg,Boolean parseSeqId,CharPtr special_symbol)2990 NLM_EXTERN SeqEntryPtr FastaToSeqEntryInternal
2991 (
2992 VoidPtr input, /* input pointer (file or memory) */
2993 Int4 type, /* type of inquiry FASTA_MEM_IO or FASTA_FILE_IO */
2994 CharPtr PNTR next_char, /* returned pointer to next FASTA sequence */
2995 Boolean is_na, /* type of sequence */
2996 CharPtr PNTR errormsg, /* error messge for debugging */
2997 Boolean parseSeqId, /* Parse SeqID from def line */
2998 CharPtr special_symbol /* Returns special symbol if no SeqEntry */
2999 )
3000 {
3001 return FastaToSeqEntryInternalEx (input, type, next_char, is_na, errormsg,
3002 parseSeqId, special_symbol, NULL, NULL,NULL);
3003 }
3004 /*****************************************************************************
3005 *
3006 * FastaId(bsp, buf, buflen)
3007 * Makes the string for the id part of fasta format.
3008 * buf should be at least 40 bytes
3009 *
3010 *****************************************************************************/
FastaId(BioseqPtr bsp,CharPtr buf,Uint4 buflen)3011 NLM_EXTERN Boolean FastaId(BioseqPtr bsp, CharPtr buf, Uint4 buflen)
3012 {
3013 if ((bsp == NULL) || (buf == NULL)) return FALSE;
3014 SeqIdWrite(bsp->id, buf, PRINTID_FASTA_LONG, buflen);
3015 return TRUE;
3016 }
3017
/* Write the FASTA id of bsp into buf.

   bsp              Bioseq whose ids are written
   buf              output buffer
   buflen           size limit handed to SeqIdWrite; treated here as the
                    total size of buf in bytes (TODO confirm against
                    SeqIdWrite, which is not visible from this file
                    section)
   printid_general  selects PRINTID_FASTA_GENERAL instead of
                    PRINTID_FASTA_LONG when the whole sequence is dumped
   seqloc           optional sub-range being dumped

   When seqloc is NULL or spans the whole Bioseq the full id is written.
   Otherwise the short id is written followed by ":<start>-<stop>" with
   1-based coordinates; the range text is truncated, or left out
   entirely, rather than written past the end of buf.

   Returns FALSE when bsp or buf is NULL, TRUE otherwise. */
static Boolean FastaIdX(BioseqPtr bsp, CharPtr buf, Uint4 buflen, Boolean printid_general, SeqLocPtr seqloc)
{
    Char   range [64];   /* ':' + two long values + '-' + NUL fits easily */
    Uint4  used;

    if ((bsp == NULL) || (buf == NULL)) return FALSE;

    if (seqloc == NULL || SeqLocLen(seqloc) == bsp->length) {
        /* Full sequence is being dumped. */
        if (printid_general) {
            SeqIdWrite(bsp->id, buf, PRINTID_FASTA_GENERAL, buflen);
        } else {
            SeqIdWrite(bsp->id, buf, PRINTID_FASTA_LONG, buflen);
        }
        return TRUE;
    }

    SeqIdWrite(bsp->id, buf, PRINTID_FASTA_SHORT, buflen);

    /* format the range separately so that only what fits is appended */
    sprintf(range, ":%ld-%ld",
            (long) (SeqLocStart(seqloc)+1), (long) (SeqLocStop(seqloc)+1));
    used = (Uint4) StringLen(buf);
    if (used + 1 < buflen) {
        StringNCat(buf, range, buflen - used - 1);
    }
    return TRUE;
}
3038
FastaGetOriginalId(BioseqPtr bsp)3039 NLM_EXTERN CharPtr FastaGetOriginalId (BioseqPtr bsp)
3040
3041 {
3042 CharPtr id;
3043 ObjectIdPtr oip;
3044 SeqDescrPtr sdp;
3045 UserFieldPtr ufp;
3046 UserObjectPtr uop;
3047
3048 if (bsp == NULL) return NULL;
3049
3050 for (sdp = bsp->descr; sdp != NULL; sdp = sdp->next) {
3051 if (sdp->choice != Seq_descr_user) continue;
3052 uop = (UserObjectPtr) sdp->data.ptrvalue;
3053 if (uop == NULL) continue;
3054 oip = uop->type;
3055 if (oip == NULL) continue;
3056 if (StringCmp (oip->str, "OrginalID") != 0 && StringCmp (oip->str, "OriginalID") != 0) continue;
3057 for (ufp = uop->data; ufp != NULL; ufp = ufp->next) {
3058 oip = ufp->label;
3059 if (oip == NULL) continue;
3060 if (StringCmp (oip->str, "LocalId") != 0) continue;
3061 if (ufp->choice != 1) continue;
3062 id = (CharPtr) ufp->data.ptrvalue;
3063 if (id == NULL) continue;
3064 return id;
3065 }
3066 }
3067
3068 return NULL;
3069 }
3070
ShouldUseOriginalID(BioseqPtr bsp)3071 NLM_EXTERN Boolean ShouldUseOriginalID (BioseqPtr bsp)
3072
3073 {
3074 DbtagPtr dbt;
3075 SeqIdPtr sip;
3076
3077 if (bsp == NULL) return FALSE;
3078
3079 for (sip = bsp->id; sip != NULL; sip = sip->next) {
3080 switch (sip->choice) {
3081 case SEQID_LOCAL :
3082 break;
3083 case SEQID_GENERAL :
3084 dbt = (DbtagPtr) sip->data.ptrvalue;
3085 if (dbt != NULL) {
3086 if (! IsSkippableDbtag (dbt)) return FALSE;
3087 }
3088 break;
3089 default :
3090 return FALSE;
3091 }
3092 }
3093
3094 return TRUE;
3095 }
3096
FastaIdEx(BioseqPtr bsp,CharPtr buf,Uint4 buflen,Boolean prefer_original_ID)3097 NLM_EXTERN Boolean FastaIdEx(BioseqPtr bsp, CharPtr buf, Uint4 buflen, Boolean prefer_original_ID)
3098 {
3099 CharPtr id;
3100 SeqIdPtr sip;
3101
3102 if ((bsp == NULL) || (buf == NULL)) return FALSE;
3103 if (prefer_original_ID) {
3104 sip = bsp->id;
3105 if (ShouldUseOriginalID (bsp)) {
3106 id = FastaGetOriginalId (bsp);
3107 if (id != NULL && StringLen (id) + 5 < buflen) {
3108 sprintf (buf, "lcl|%s", id);
3109 return TRUE;
3110 }
3111 }
3112 }
3113 SeqIdWrite(bsp->id, buf, PRINTID_FASTA_LONG, buflen);
3114 return TRUE;
3115 }
3116
3117 /*****************************************************************************
3118 *
*   FastaDefLine(bsp, buf, buflen, accession, organism, tech)
3120 * Finds or makes a FASTA format defline (just locates the string)
3121 * buf should be very long if possible
3122 * function truncates if buf not long enough
3123 * a few deflines are longer than 255
3124 *
3125 *****************************************************************************/
FastaDefLine(BioseqPtr bsp,CharPtr buf,Uint4 buflen,CharPtr accession,CharPtr organism,Uint1 tech)3126 NLM_EXTERN Boolean FastaDefLine (BioseqPtr bsp, CharPtr buf, Uint4 buflen,
3127 CharPtr accession, CharPtr organism, Uint1 tech)
3128 {
3129 BioseqContextPtr bcp;
3130 ValNodePtr vnp;
3131 CharPtr tmp;
3132 PdbBlockPtr pbp;
3133 PatentSeqIdPtr psip;
3134 Uint4 diff, phase;
3135 Int4 num_segs, num_gaps;
3136 Char tbuf[128];
3137 static CharPtr htgs[2] = {
3138 "unordered", "ordered" };
3139 if ((bsp == NULL) || (buf == NULL)) return FALSE;
3140 buflen--;
3141 buf[buflen] = '\0';
3142 if (accession != NULL)
3143 {
3144 diff = LabelCopyExtra(buf, accession, buflen, "(", ") ");
3145 buflen -= diff;
3146 buf += diff;
3147 }
3148 bcp = BioseqContextNew(bsp);
3149 diff = 0;
3150 if ((tmp = BioseqContextGetTitle(bcp)) != NULL) {
3151 diff = LabelCopy(buf, tmp, buflen);
3152 /* remove trailing blanks and periods */
3153 tmp = buf + diff - 1; /* point at last character */
3154 while (((*tmp <= ' ') || (*tmp == '.')) && (diff))
3155 {
3156 *tmp = '\0';
3157 tmp--; diff--;
3158 }
3159 }
3160 else
3161 if ((vnp = BioseqContextGetSeqDescr(bcp, Seq_descr_pdb, NULL, NULL)) != NULL)
3162 {
3163 pbp = (PdbBlockPtr)(vnp->data.ptrvalue);
3164 diff = LabelCopy(buf, (CharPtr)(pbp->compound->data.ptrvalue), buflen);
3165 }
3166 else
3167 {
3168 for (vnp = bsp->id; vnp != NULL; vnp = vnp->next)
3169 {
3170 if (vnp->choice == SEQID_PATENT)
3171 {
3172 psip = (PatentSeqIdPtr)(vnp->data.ptrvalue);
3173 sprintf(tbuf, "Sequence %d from Patent %s %s",
3174 (int)psip->seqid, psip->cit->country, psip->cit->number);
3175 diff = LabelCopy(buf, tbuf, buflen);
3176 break;
3177 }
3178 }
3179 if (vnp == NULL)
3180 diff = LabelCopy(buf, "No definition line found", buflen);
3181 }
3182 buflen -= diff;
3183 buf += diff;
3184 BioseqContextFree(bcp);
3185 if (((tech >= MI_TECH_htgs_1) && (tech <= MI_TECH_htgs_3)) ||
3186 (tech == MI_TECH_htgs_0))
3187 {
3188 if (tech == MI_TECH_htgs_0) {
3189 phase = 0;
3190 StringMove(tbuf, ", LOW-PASS SEQUENCE SAMPLING.");
3191 }
3192 else {
3193 phase = (Int2)(tech - MI_TECH_htgs_1 + 1);
3194 if (phase != 3)
3195 StringMove(tbuf, ", WORKING DRAFT SEQUENCE");
3196 }
3197 if (phase != 3) {
3198 diff = LabelCopy(buf, tbuf, buflen);
3199 buflen -= diff;
3200 buf += diff;
3201 }
3202 if (phase == 3)
3203 {
3204 if (tmp && StringStr(tmp, "complete sequence") == NULL) {
3205 diff = LabelCopy(buf, ", complete sequence", buflen);
3206 buflen -= diff;
3207 buf += diff;
3208 }
3209 }
3210 else if ((bsp->repr == Seq_repr_delta) && (phase != 0))
3211 {
3212 if (CountGapsInDeltaSeq(bsp, &num_segs, &num_gaps, NULL, NULL, NULL, 0))
3213 {
3214 if (num_gaps > 0) {
3215 sprintf(tbuf, ", %ld %s pieces", (long)(num_gaps + 1), htgs[phase - 1]);
3216 } else {
3217 sprintf(tbuf, ", %ld %s piece", (long)(num_gaps + 1), htgs[phase - 1]);
3218 }
3219 diff = LabelCopy(buf, tbuf, buflen);
3220 buflen -= diff;
3221 buf += diff;
3222 }
3223 }
3224 }
3225 if (organism != NULL)
3226 {
3227 LabelCopyExtra(buf, organism, buflen, " [", "]");
3228 }
3229 return TRUE;
3230 }
is_pdb(BioseqPtr bsp)3231 static Boolean is_pdb(BioseqPtr bsp)
3232 {
3233 SeqIdPtr id;
3234 if (bsp ==NULL)
3235 return FALSE;
3236 for (id = bsp->id; id; id=id->next)
3237 {
3238 if (id->choice == SEQID_PDB)
3239 return TRUE;
3240 }
3241 return FALSE;
3242 }
/* Appends the chain `next` to the end of the list `head` and returns the
   head of the combined list.  When head is NULL, next itself is returned. */
static ValNodePtr tie_next(ValNodePtr head, ValNodePtr next)
{
  ValNodePtr tail;

  if (head == NULL) return next;

  tail = head;
  while (tail->next != NULL) {
    tail = tail->next;
  }
  tail->next = next;
  return head;
}
get_descr_on_top(GatherContextPtr gcp)3254 static Boolean get_descr_on_top (GatherContextPtr gcp)
3255 {
3256 ValNodePtr tmp;
3257 DescrInfoPtr PNTR dspp;
3258 DescrInfoPtr dsp;
3259 ItemInfoPtr iip;
3260 dspp = (DescrInfoPtr PNTR) gcp->userdata;
3261 dsp = *dspp;
3262 switch (gcp->thistype) {
3263 case OBJ_SEQDESC:
3264 tmp = (ValNodePtr) (gcp->thisitem);
3265 if (tmp->choice == dsp->choice) {
3266 if (tmp->data.ptrvalue != NULL) {
3267 dsp->vnp = tmp;
3268 iip = (ItemInfoPtr) MemNew(sizeof(ItemInfo));
3269 if(dsp->iip != NULL)
3270 MemFree(dsp->iip);
3271 dsp->iip = iip;
3272 iip->entityID = gcp->entityID;
3273 iip->itemID = gcp->itemID;
3274 iip->itemtype = gcp->thistype;
3275 }
3276 }
3277 break;
3278 default:
3279 break;
3280 }
3281 return TRUE;
3282 }
get_descr(GatherContextPtr gcp)3283 static Boolean get_descr (GatherContextPtr gcp)
3284 {
3285 ValNodePtr tmp;
3286 DescrInfoPtr PNTR dspp;
3287 DescrInfoPtr dsp;
3288 ItemInfoPtr iip;
3289 BioseqPtr bsp;
3290 dspp = (DescrInfoPtr PNTR) gcp->userdata;
3291 dsp = *dspp;
3292 switch (gcp->thistype)
3293 {
3294 case OBJ_SEQDESC:
3295 tmp = (ValNodePtr) (gcp->thisitem);
3296 if (tmp->choice == dsp->choice) {
3297 bsp = (BioseqPtr) (gcp->parentitem);
3298 if (dsp->bsp != bsp) {
3299 break;
3300 }
3301 if (tmp->data.ptrvalue != NULL) {
3302 dsp->vnp = tmp;
3303 iip = (ItemInfoPtr) MemNew(sizeof(ItemInfo));
3304 dsp->iip = iip;
3305 iip->entityID = gcp->entityID;
3306 iip->itemID = gcp->itemID;
3307 iip->itemtype = gcp->thistype;
3308 }
3309 }
3310 break;
3311 default:
3312 break;
3313 }
3314 return TRUE;
3315 }
GetFeatProt(GatherContextPtr gcp)3316 static Boolean GetFeatProt (GatherContextPtr gcp)
3317 {
3318 ValNodePtr PNTR vnpp;
3319 ValNodePtr tmp;
3320 SeqFeatPtr sfp;
3321 vnpp = (ValNodePtr PNTR) gcp->userdata;
3322 switch (gcp->thistype)
3323 {
3324 case OBJ_SEQFEAT:
3325 sfp = (SeqFeatPtr) (gcp->thisitem);
3326 if (sfp->data.choice == SEQFEAT_PROT) {
3327 tmp = ValNodeNew(NULL);
3328 tmp->data.ptrvalue = sfp;
3329 *vnpp = tie_next(*vnpp, tmp);
3330 }
3331 break;
3332 default:
3333 break;
3334 }
3335 return TRUE;
3336 }
GetFeatCDS(GatherContextPtr gcp)3337 static Boolean GetFeatCDS (GatherContextPtr gcp)
3338 {
3339 SeqFeatPtr PNTR sfpp;
3340 SeqFeatPtr sfp;
3341 sfpp = (SeqFeatPtr PNTR) gcp->userdata;
3342 switch (gcp->thistype)
3343 {
3344 case OBJ_SEQFEAT:
3345 sfp = (SeqFeatPtr) (gcp->thisitem);
3346 if (sfp->data.choice == SEQFEAT_CDREGION) {
3347 *sfpp = sfp;
3348 return FALSE;
3349 }
3350 break;
3351 default:
3352 break;
3353 }
3354 *sfpp = NULL;
3355 return TRUE;
3356 }
GetFeatGenes(GatherContextPtr gcp)3357 static Boolean GetFeatGenes (GatherContextPtr gcp)
3358 {
3359 ValNodePtr PNTR vnpp;
3360 ValNodePtr tmp;
3361 SeqFeatPtr sfp;
3362 vnpp = (ValNodePtr PNTR) gcp->userdata;
3363 switch (gcp->thistype)
3364 {
3365 case OBJ_SEQFEAT:
3366 sfp = (SeqFeatPtr) (gcp->thisitem);
3367 if (sfp->data.choice == SEQFEAT_GENE) {
3368 tmp = ValNodeNew(NULL);
3369 tmp->data.ptrvalue = sfp;
3370 *vnpp = tie_next(*vnpp, tmp);
3371 }
3372 break;
3373 default:
3374 break;
3375 }
3376 return TRUE;
3377 }
IndexedGatherDescrOnBioseq(ItemInfoPtr iip,BioseqPtr bsp,Uint1 choice)3378 static ValNodePtr IndexedGatherDescrOnBioseq (ItemInfoPtr iip, BioseqPtr bsp, Uint1 choice)
3379 {
3380 SeqMgrDescContext dcontext;
3381 SeqDescrPtr sdp;
3382 sdp = SeqMgrGetNextDescriptor (bsp, NULL, choice, &dcontext);
3383 if (sdp == NULL) return NULL;
3384 if (ISA_aa(bsp->mol) && !is_pdb(bsp)) {
3385 if (dcontext.level != 0) return NULL;
3386 }
3387 if (iip != NULL) {
3388 iip->entityID = dcontext.entityID;
3389 iip->itemID = dcontext.itemID;
3390 iip->itemtype = OBJ_SEQDESC;
3391 }
3392 return sdp;
3393 }
GatherDescrOnBioseq(ItemInfoPtr iip,BioseqPtr bsp,Uint1 choice,Boolean get_first)3394 static ValNodePtr GatherDescrOnBioseq(ItemInfoPtr iip, BioseqPtr bsp, Uint1 choice, Boolean get_first)
3395 {
3396 ValNodePtr vnp = NULL;
3397 /*
3398 GatherScope gsc;
3399 SeqLocPtr slp;
3400 Uint2 bspID;
3401 DescrInfoPtr dsp;
3402 Uint2 entityID;
3403 */
3404 ObjValNodePtr ovp;
3405 if (ISA_aa(bsp->mol) && !is_pdb(bsp)) {
3406 vnp = BioseqGetSeqDescr (bsp, choice, NULL);
3407 } else {
3408 vnp = GetNextDescriptorUnindexed (bsp, choice, NULL);
3409 }
3410 if (vnp != NULL) {
3411 if (iip != NULL) {
3412 if (vnp->extended != 0) {
3413 ovp = (ObjValNodePtr) vnp;
3414 iip->entityID = ovp->idx.entityID;
3415 iip->itemtype = ovp->idx.itemtype;
3416 iip->itemID = ovp->idx.itemID;
3417 }
3418 }
3419 }
3420 return vnp;
3421 #if 0
3422 entityID = ObjMgrGetEntityIDForPointer (bsp);
3423 if (SeqMgrFeaturesAreIndexed (entityID)) {
3424 return IndexedGatherDescrOnBioseq (iip, bsp, choice);
3425 }
3426 /*
3427 if (iip==NULL && (get_first || (ISA_aa(bsp->mol) && !is_pdb(bsp))) ) {
3428 for(vnp=bsp->descr;vnp && vnp->choice != choice; vnp=vnp->next){}
3429 return vnp;
3430 }
3431 */
3432 if (iip==NULL && get_first)
3433 {
3434 for(vnp=bsp->descr;vnp; vnp=vnp->next)
3435 if(vnp->choice == choice)
3436 return vnp;
3437 }
3438 dsp = (DescrInfoPtr) MemNew(sizeof(DescrInfo));
3439 dsp->choice = choice;
3440 dsp->bsp = bsp;
3441 MemSet ((Pointer) (&gsc), 0, sizeof (GatherScope));
3442 MemSet ((Pointer) (gsc.ignore), (int)(TRUE), (size_t) (OBJ_MAX * sizeof(Boolean)));
3443 gsc.ignore[OBJ_SEQDESC] = FALSE;
3444 bspID = ObjMgrGetEntityIDForPointer(bsp);
3445 slp = ValNodeNew(NULL);
3446 slp->choice = SEQLOC_WHOLE;
3447 slp->data.ptrvalue = (SeqIdPtr) SeqIdDup (SeqIdFindBest (bsp->id, 0));
3448 gsc.target = slp;
3449 if (ISA_aa(bsp->mol) && !is_pdb(bsp)) {
3450 GatherEntity(bspID, &dsp, get_descr, &gsc);
3451 } else {
3452 GatherEntity(bspID, &dsp, get_descr_on_top, &gsc);
3453 }
3454 SeqLocFree(slp);
3455 vnp = dsp->vnp;
3456 if (vnp && vnp->data.ptrvalue) {
3457 if (iip != NULL) {
3458 iip->entityID = dsp->iip->entityID;
3459 iip->itemID = dsp->iip->itemID;
3460 iip->itemtype = dsp->iip->itemtype;
3461 }
3462 MemFree(dsp->iip);
3463 MemFree(dsp);
3464 return vnp;
3465 }
3466 MemFree(dsp->iip);
3467 MemFree(dsp);
3468 return NULL;
3469 #endif
3470 }
3471 /* more efficient versions of feature gather functions for protein defline */
/* State shared between the unindexed feature searches below and their
   per-feature visit callbacks. */
typedef struct unidxfeatdata {
  SeqIdPtr    bspid;     /* ids of the Bioseq a feature must refer to (checked with SeqIdIn) */
  SeqLocPtr   loc;       /* location compared against gene locations in GetBestGeneFeat */
  Int4        longest;   /* length of the longest protein feature accepted so far */
  Int4        shortest;  /* smallest non-negative SeqLocAinB result accepted so far */
  SeqFeatPtr  sfp;       /* feature selected so far, NULL if none */
} UndxFeatData, PNTR UndxFeatPtr;
/*
 * GetLongestProtFeat
 *   Feature visit callback; userdata is an UndxFeatPtr.  A protein
 *   feature whose location id is among ufp->bspid, and whose location
 *   length is known (not -1) and greater than ufp->longest, becomes the
 *   new ufp->sfp / ufp->longest.  Everything else is ignored.
 */
static void GetLongestProtFeat (
  SeqFeatPtr sfp,
  Pointer userdata
)
{
  UndxFeatPtr  best;
  SeqIdPtr     loc_id;
  Int4         loc_len;

  if (sfp == NULL || sfp->data.choice != SEQFEAT_PROT) return;

  best = (UndxFeatPtr) userdata;
  if (best == NULL) return;

  loc_id = SeqLocId (sfp->location);
  if (loc_id == NULL || ! SeqIdIn (loc_id, best->bspid)) return;

  loc_len = SeqLocLen (sfp->location);
  if (loc_len != -1 && loc_len > best->longest) {
    best->sfp = sfp;
    best->longest = loc_len;
  }
}
GetLongestProteinUnindexed(BioseqPtr bsp)3500 static SeqFeatPtr GetLongestProteinUnindexed (
3501 BioseqPtr bsp
3502 )
3503 {
3504 BioseqSetPtr bssp = NULL;
3505 UndxFeatData ufd;
3506 if (bsp == NULL) return NULL;
3507 MemSet ((Pointer) &ufd, 0, sizeof (UndxFeatData));
3508 ufd.bspid = bsp->id;
3509 ufd.longest = 0;
3510 ufd.sfp = NULL;
3511 VisitFeaturesOnBsp (bsp, (Pointer) &ufd, GetLongestProtFeat);
3512 if (bsp->idx.parenttype == OBJ_BIOSEQSET) {
3513 bssp = (BioseqSetPtr) bsp->idx.parentptr;
3514 }
3515 if (bssp != NULL && bssp->_class == BioseqseqSet_class_parts) {
3516 VisitFeaturesOnSet (bssp, (Pointer) &ufd, GetLongestProtFeat);
3517 if (bssp->idx.parenttype == OBJ_BIOSEQSET) {
3518 bssp = (BioseqSetPtr) bssp->idx.parentptr;
3519 }
3520 }
3521 if (bssp != NULL && bssp->_class == BioseqseqSet_class_segset) {
3522 VisitFeaturesOnSet (bssp, (Pointer) &ufd, GetLongestProtFeat);
3523 }
3524 return ufd.sfp;
3525 }
/*
 * GetCDSProtFeat
 *   Feature visit callback; userdata is an UndxFeatPtr.  A coding region
 *   whose product id is among ufp->bspid is stored in ufp->sfp.  A later
 *   matching coding region replaces an earlier one.
 */
static void GetCDSProtFeat (
  SeqFeatPtr sfp,
  Pointer userdata
)
{
  UndxFeatPtr  search;
  SeqIdPtr     product_id;

  if (sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION) return;

  search = (UndxFeatPtr) userdata;
  if (search == NULL) return;

  product_id = SeqLocId (sfp->product);
  if (product_id != NULL && SeqIdIn (product_id, search->bspid)) {
    search->sfp = sfp;
  }
}
GetCDSProtUnindexed(BioseqPtr bsp)3541 static SeqFeatPtr GetCDSProtUnindexed (
3542 BioseqPtr bsp
3543 )
3544 {
3545 Uint2 entityID;
3546 SeqEntryPtr sep;
3547 UndxFeatData ufd;
3548 if (bsp == NULL) return NULL;
3549 entityID = ObjMgrGetEntityIDForPointer (bsp);
3550 sep = GetTopSeqEntryForEntityID (entityID);
3551 if (sep == NULL) return NULL;
3552 MemSet ((Pointer) &ufd, 0, sizeof (UndxFeatData));
3553 ufd.bspid = bsp->id;
3554 ufd.sfp = NULL;
3555 VisitFeaturesInSep (sep, (Pointer) &ufd, GetCDSProtFeat);
3556 return ufd.sfp;
3557 }
/*
 * GetBestGeneFeat
 *   Feature visit callback; userdata is an UndxFeatPtr.  For a gene whose
 *   location id is among ufp->bspid, SeqLocAinB (ufp->loc, gene location)
 *   is computed; a non-negative result smaller than ufp->shortest makes
 *   this gene the new ufp->sfp / ufp->shortest.
 */
static void GetBestGeneFeat (
  SeqFeatPtr sfp,
  Pointer userdata
)
{
  UndxFeatPtr  search;
  SeqIdPtr     gene_id;
  Int4         extra;

  if (sfp == NULL || sfp->data.choice != SEQFEAT_GENE) return;

  search = (UndxFeatPtr) userdata;
  if (search == NULL) return;

  gene_id = SeqLocId (sfp->location);
  if (gene_id == NULL || ! SeqIdIn (gene_id, search->bspid)) return;

  extra = SeqLocAinB (search->loc, sfp->location);
  if (extra >= 0 && extra < search->shortest) {
    search->sfp = sfp;
    search->shortest = extra;
  }
}
GetBestGeneUnindexed(SeqLocPtr slp,Uint2 entityID)3580 static SeqFeatPtr GetBestGeneUnindexed (
3581 SeqLocPtr slp,
3582 Uint2 entityID
3583 )
3584 {
3585 BioseqPtr bsp;
3586 SeqEntryPtr sep;
3587 SeqIdPtr sip;
3588 UndxFeatData ufd;
3589 if (slp == NULL) return NULL;
3590 sip = SeqLocId (slp);
3591 if (sip == NULL) return NULL;
3592 bsp = BioseqFindCore (sip);
3593 if (bsp == NULL) return NULL;
3594 sep = GetTopSeqEntryForEntityID (entityID);
3595 if (sep == NULL) return NULL;
3596 MemSet ((Pointer) &ufd, 0, sizeof (UndxFeatData));
3597 ufd.bspid = bsp->id;
3598 ufd.loc = slp;
3599 ufd.shortest = INT4_MAX;
3600 ufd.sfp = NULL;
3601 VisitFeaturesInSep (sep, (Pointer) &ufd, GetBestGeneFeat);
3602 return ufd.sfp;
3603 }
3604 /* GatherProtCDS is still faster than GetCDSProtUnindexed for some reason */
GatherProtCDS(BioseqPtr bsp)3605 static SeqFeatPtr GatherProtCDS(BioseqPtr bsp)
3606 {
3607 GatherScope gsc;
3608 SeqLocPtr slp = NULL;
3609 Uint2 bspID;
3610 SeqFeatPtr sfp;
3611 MemSet ((Pointer) (&gsc), 0, sizeof (GatherScope));
3612 MemSet ((Pointer) (gsc.ignore), (int)(TRUE), (size_t) (OBJ_MAX * sizeof(Boolean)));
3613 gsc.ignore[OBJ_SEQFEAT] = FALSE;
3614 gsc.ignore[OBJ_SEQANNOT] = FALSE;
3615 gsc.get_feats_product = TRUE;
3616 bspID = ObjMgrGetEntityIDForPointer(bsp);
3617 slp = ValNodeNew(NULL);
3618 slp->choice = SEQLOC_WHOLE;
3619 slp->data.ptrvalue = (SeqIdPtr) SeqIdDup (SeqIdFindBest (bsp->id, 0));
3620 gsc.target = slp;
3621 sfp = NULL;
3622 GatherEntity(bspID, &sfp, GetFeatCDS, &gsc);
3623 SeqLocFree(slp);
3624 return sfp;
3625 }
3626 /* obsolete functions, replaced by Unindexed versions */
GatherSeqFeatProt(BioseqPtr bsp)3627 static SeqFeatPtr GatherSeqFeatProt(BioseqPtr bsp)
3628 {
3629 GatherScope gsc;
3630 SeqLocPtr slp = NULL;
3631 Uint2 bspID;
3632 SeqFeatPtr sfp = NULL;
3633 SeqFeatPtr f;
3634 ValNodePtr prot, v;
3635 Int4 length, longest_length=0;
3636 MemSet ((Pointer) (&gsc), 0, sizeof (GatherScope));
3637 MemSet ((Pointer) (gsc.ignore), (int)(TRUE), (size_t) (OBJ_MAX * sizeof(Boolean)));
3638 gsc.ignore[OBJ_SEQFEAT] = FALSE;
3639 gsc.ignore[OBJ_SEQANNOT] = FALSE;
3640 gsc.get_feats_location = TRUE;
3641 bspID = ObjMgrGetEntityIDForPointer(bsp);
3642 slp = ValNodeNew(NULL);
3643 slp->choice = SEQLOC_WHOLE;
3644 slp->data.ptrvalue = (SeqIdPtr) SeqIdDup (SeqIdFindBest (bsp->id, 0));
3645 gsc.target = slp;
3646 prot = NULL;
3647 GatherEntity(bspID, &prot, GetFeatProt, &gsc);
3648 for (v=prot; v; v=v->next) {
3649 f = (SeqFeatPtr) v->data.ptrvalue;
3650 if ((length=SeqLocLen(f->location)) == -1)
3651 continue;
3652 if (length > longest_length) {
3653 sfp = f;
3654 longest_length = length;
3655 }
3656 }
3657 ValNodeFree(prot);
3658 SeqLocFree(slp);
3659 return sfp;
3660 }
GatherGenesForCDS(SeqLocPtr slp)3661 static ValNodePtr GatherGenesForCDS(SeqLocPtr slp)
3662 {
3663 GatherScope gsc;
3664 Uint2 bspID;
3665 ValNodePtr vnp;
3666 BioseqPtr bsp;
3667 bsp = BioseqFindCore(SeqLocId(slp));
3668 if (bsp == NULL)
3669 return NULL;
3670 bspID = ObjMgrGetEntityIDForPointer(bsp);
3671 MemSet ((Pointer) (&gsc), 0, sizeof (GatherScope));
3672 MemSet ((Pointer) (gsc.ignore), (int)(TRUE), (size_t) (OBJ_MAX * sizeof(Boolean)));
3673 gsc.ignore[OBJ_SEQFEAT] = FALSE;
3674 gsc.ignore[OBJ_SEQANNOT] = FALSE;
3675 gsc.get_feats_location = TRUE;
3676 gsc.target = slp;
3677 vnp = NULL;
3678 GatherEntity(bspID, &vnp, GetFeatGenes, &gsc);
3679 return vnp;
3680 }
/* Features collected by FindNMFeats while visiting a Seq-entry. */
typedef struct nmdef {
  SeqFeatPtr  gene;      /* most recently visited gene feature */
  SeqFeatPtr  cds;       /* most recently visited coding region feature */
  SeqFeatPtr  prot;      /* protein feature with the longest location seen so far */
  Int4        protlen;   /* location length of prot */
  Int2        numgenes;  /* number of gene features visited */
  Int2        numcds;    /* number of coding region features visited */
  Int2        numprots;  /* number of times prot was replaced by a longer protein
                            (not the total number of protein features) */
} NMDef, PNTR NMDefPtr;
/*
 * FindNMFeats
 *   Feature visit callback; userdata is an NMDefPtr.  Genes and coding
 *   regions are recorded (last one wins) and counted.  A protein feature
 *   is recorded, and numprots incremented, only when its location is
 *   longer than the longest protein recorded so far.
 */
static void FindNMFeats (SeqFeatPtr sfp, Pointer userdata)
{
  NMDefPtr  found;
  Int4      prot_len;

  if (sfp == NULL) return;
  found = (NMDefPtr) userdata;
  if (found == NULL) return;

  if (sfp->data.choice == SEQFEAT_GENE) {
    found->gene = sfp;
    (found->numgenes)++;
  } else if (sfp->data.choice == SEQFEAT_CDREGION) {
    found->cds = sfp;
    (found->numcds)++;
  } else if (sfp->data.choice == SEQFEAT_PROT) {
    prot_len = SeqLocLen (sfp->location);
    if (prot_len > found->protlen) {
      found->prot = sfp;
      found->protlen = prot_len;
      (found->numprots)++;
    }
  }
}
IsFlyCG(CharPtr str)3718 static Boolean IsFlyCG (CharPtr str)
3719 {
3720 Char ch;
3721 if (StringHasNoText (str)) return FALSE;
3722 ch = *str;
3723 if (ch != 'C') return FALSE;
3724 str++;
3725 ch = *str;
3726 if (ch != 'G') return FALSE;
3727 str++;
3728 ch = *str;
3729 while (IS_DIGIT (ch)) {
3730 str++;
3731 ch = *str;
3732 }
3733 if (ch != '-') return FALSE;
3734 str++;
3735 ch = *str;
3736 if (ch != 'P') return FALSE;
3737 str++;
3738 ch = *str;
3739 if (IS_ALPHA (ch)) {
3740 str++;
3741 ch = *str;
3742 if (ch == '\0' || ch == ' ' || ch == ',' || ch == ';') return TRUE;
3743 }
3744 return FALSE;
3745 }
/*
 * ReplaceFlyDashPwithDashR
 *   Walks str word by word (words are separated by white space).  In the
 *   first word accepted by IsFlyCG, the "-P" is changed to "-R" in place
 *   and the function returns.  Words that do not match are left alone.
 */
static void ReplaceFlyDashPwithDashR (CharPtr str)
{
  CharPtr dash;

  while (StringDoesHaveText (str)) {
    /* move to the start of the next word */
    while (IS_WHITESP (*str)) {
      str++;
    }

    if (IsFlyCG (str)) {
      dash = StringStr (str, "-P");
      if (dash != NULL) {
        dash [1] = 'R';
        return;
      }
    }

    /* skip over the word that did not match */
    while (*str != '\0' && (! IS_WHITESP (*str))) {
      str++;
    }
  }
}
FindNMDefLine(BioseqPtr bsp)3769 static CharPtr FindNMDefLine (BioseqPtr bsp)
3770 {
3771 BioSourcePtr biop;
3772 Char buf [512], buf2 [600];
3773 CharPtr cds = NULL;
3774 Uint2 entityID;
3775 CharPtr gene;
3776 Boolean is_refseq = FALSE;
3777 size_t len;
3778 NMDef nd;
3779 OrgRefPtr orp;
3780 CharPtr ptr;
3781 SeqEntryPtr sep;
3782 SeqIdPtr sip;
3783 CharPtr str;
3784 ValNodePtr vnp;
3785 MemSet ((Pointer) &nd, 0, sizeof (NMDef));
3786 entityID = ObjMgrGetEntityIDForPointer (bsp);
3787 sep = GetBestTopParentForDataEx (entityID, bsp, TRUE);
3788 VisitFeaturesInSep (sep, (Pointer) &nd, FindNMFeats);
3789 if (nd.numgenes != 1 || nd.numcds != 1 || nd.numprots < 1) return NULL;
3790 vnp = GatherDescrOnBioseq (NULL, bsp, Seq_descr_source, FALSE);
3791 if (vnp == NULL) return NULL;
3792 biop = (BioSourcePtr) vnp->data.ptrvalue;
3793 orp = biop->org;
3794 if (orp == NULL || StringHasNoText (orp->taxname)) return NULL;
3795 FeatDefLabel (nd.gene, buf, sizeof (buf) - 1, OM_LABEL_CONTENT);
3796 gene = StringSaveNoNull (buf);
3797 FeatDefLabel (nd.cds, buf, sizeof (buf) - 1, OM_LABEL_CONTENT);
3798 for (sip = bsp->id; sip != NULL; sip = sip->next) {
3799 if (sip->choice == SEQID_OTHER) {
3800 is_refseq = TRUE;
3801 }
3802 }
3803 if (is_refseq) {
3804 /* special case Drosophila RefSeq NM titles */
3805 if (StringICmp (orp->taxname, "Drosophila melanogaster") == 0) {
3806 ReplaceFlyDashPwithDashR (buf);
3807 }
3808 ptr = StringStr (buf, "isoform ");
3809 if (ptr != NULL) {
3810 *ptr = '\0';
3811 ptr += 8;
3812 StringCpy (buf2, buf);
3813 StringCat (buf2, "transcript variant ");
3814 StringCat (buf2, ptr);
3815 cds = StringSaveNoNull (buf2);
3816 } else {
3817 cds = StringSaveNoNull (buf);
3818 }
3819 } else {
3820 cds = StringSaveNoNull (buf);
3821 }
3822 len = StringLen (orp->taxname) + StringLen (cds) +
3823 StringLen (gene) + StringLen (" (), mRNA") + 10;
3824 str = (CharPtr) MemNew (len);
3825 if (str != NULL) {
3826 sprintf (str, "%s %s (%s), mRNA", orp->taxname, cds, gene);
3827 }
3828 MemFree (gene);
3829 MemFree (cds);
3830 return str;
3831 }
FindNRDefLine(BioseqPtr bsp)3832 static CharPtr FindNRDefLine (BioseqPtr bsp)
3833 {
3834 BioSourcePtr biop;
3835 Char buf [512];
3836 Uint2 entityID;
3837 CharPtr gene;
3838 size_t len;
3839 MolInfoPtr mip;
3840 NMDef nd;
3841 OrgRefPtr orp;
3842 CharPtr rna = "miscRNA";
3843 SeqEntryPtr sep;
3844 CharPtr str;
3845 ValNodePtr vnp;
3846 MemSet ((Pointer) &nd, 0, sizeof (NMDef));
3847 entityID = ObjMgrGetEntityIDForPointer (bsp);
3848 sep = GetBestTopParentForDataEx (entityID, bsp, TRUE);
3849 VisitFeaturesInSep (sep, (Pointer) &nd, FindNMFeats);
3850 if (nd.numgenes < 1) return NULL;
3851 vnp = GatherDescrOnBioseq (NULL, bsp, Seq_descr_source, FALSE);
3852 if (vnp == NULL) return NULL;
3853 biop = (BioSourcePtr) vnp->data.ptrvalue;
3854 orp = biop->org;
3855 if (orp == NULL || StringHasNoText (orp->taxname)) return NULL;
3856 FeatDefLabel (nd.gene, buf, sizeof (buf) - 1, OM_LABEL_CONTENT);
3857 gene = StringSaveNoNull (buf);
3858 vnp = GatherDescrOnBioseq (NULL, bsp, Seq_descr_molinfo,TRUE);
3859 if (vnp != NULL) {
3860 mip = (MolInfoPtr) vnp->data.ptrvalue;
3861 if (mip != NULL) {
3862 switch (mip->biomol) {
3863 case MOLECULE_TYPE_PRE_MRNA :
3864 rna = "precursorRNA";
3865 break;
3866 case MOLECULE_TYPE_MRNA :
3867 rna = "mRNA";
3868 break;
3869 case MOLECULE_TYPE_RRNA :
3870 rna = "rRNA";
3871 break;
3872 case MOLECULE_TYPE_TRNA :
3873 rna = "tRNA";
3874 break;
3875 case MOLECULE_TYPE_SNRNA :
3876 rna = "snRNA";
3877 break;
3878 case MOLECULE_TYPE_SCRNA :
3879 rna = "scRNA";
3880 break;
3881 case MOLECULE_TYPE_CRNA :
3882 rna = "cRNA";
3883 break;
3884 case MOLECULE_TYPE_SNORNA :
3885 rna = "snoRNA";
3886 break;
3887 case MOLECULE_TYPE_TRANSCRIBED_RNA :
3888 rna = "miscRNA";
3889 break;
3890 case MOLECULE_TYPE_NCRNA :
3891 rna = "ncRNA";
3892 break;
3893 case MOLECULE_TYPE_TMRNA :
3894 rna = "tmRNA";
3895 break;
3896 default :
3897 break;
3898 }
3899 }
3900 }
3901 len = StringLen (orp->taxname) + StringLen (gene) +
3902 StringLen (", ") + 30;
3903 str = (CharPtr) MemNew (len);
3904 if (str != NULL) {
3905 sprintf (str, "%s %s, %s", orp->taxname, gene, rna);
3906 }
3907 MemFree (gene);
3908 return str;
3909 }
3910
/*
 * TrimPunctuationFromEnd
 *   Removes, in place, the run of blanks, semicolons, commas, tildes and
 *   periods at the end of str.  The string is only written to when there
 *   is something to remove.  Returns str (NULL is passed through).
 */
static CharPtr TrimPunctuationFromEnd (CharPtr str)

{
  Uchar    ch;    /* to use 8bit characters in multibyte languages */
  CharPtr  cut;
  CharPtr  end;

  if (str == NULL || str [0] == '\0') return str;

  /* locate the terminator */
  end = str;
  while (*end != '\0') {
    end++;
  }

  /* walk back over the trailing punctuation */
  cut = end;
  while (cut > str) {
    ch = (Uchar) cut [-1];
    if (ch != ' ' && ch != ';' && ch != ',' && ch != '~' && ch != '.') break;
    cut--;
  }

  if (cut != end) {
    *cut = '\0';
  }

  return str;
}
3939
/*
 * TrimNonPeriodPunctuationFromEnd
 *   Removes, in place, the run of blanks, semicolons, commas and tildes
 *   at the end of str; periods are kept and stop the trimming.  The
 *   string is only written to when there is something to remove.
 *   Returns str (NULL is passed through).
 */
static CharPtr TrimNonPeriodPunctuationFromEnd (CharPtr str)

{
  Uchar    ch;    /* to use 8bit characters in multibyte languages */
  CharPtr  cut;
  CharPtr  end;

  if (str == NULL || str [0] == '\0') return str;

  /* locate the terminator */
  end = str;
  while (*end != '\0') {
    end++;
  }

  /* walk back over the trailing punctuation */
  cut = end;
  while (cut > str) {
    ch = (Uchar) cut [-1];
    if (ch != ' ' && ch != ';' && ch != ',' && ch != '~') break;
    cut--;
  }

  if (cut != end) {
    *cut = '\0';
  }

  return str;
}
3968
FindProtDefLine(BioseqPtr bsp,Boolean extProtTitle)3969 static CharPtr FindProtDefLine(BioseqPtr bsp, Boolean extProtTitle)
3970 {
3971 SeqFeatPtr sfp = NULL /* , f */;
3972 ProtRefPtr prp;
3973 SeqFeatXrefPtr xref;
3974 GeneRefPtr grp=NULL;
3975 ValNodePtr vnp, /* v, */ syn;
3976 SeqLocPtr loc;
3977 CharPtr title = NULL, s, geneprod;
3978 /*
3979 Int4 diff_lowest = INT4_MAX, diff_current;
3980 */
3981 Int2 length = 0;
3982 SeqFeatPtr best_gene = NULL;
3983 Uint2 entityID;
3984 Boolean indexed;
3985 if (bsp == NULL) {
3986 return NULL;
3987 }
3988 entityID = ObjMgrGetEntityIDForPointer (bsp);
3989 indexed = (Boolean)SeqMgrFeaturesAreIndexed (entityID);
3990 sfp = NULL;
3991 if (indexed) {
3992 sfp = SeqMgrGetBestProteinFeature (bsp, NULL);
3993 } else {
3994 sfp = GetLongestProteinUnindexed (bsp);
3995 /*
3996 if (sfp == NULL) {
3997 sfp = GatherSeqFeatProt(bsp);
3998 }
3999 */
4000 }
4001 if (sfp != NULL) {
4002 prp = (ProtRefPtr) sfp->data.value.ptrvalue;
4003 if (prp && prp->name) {
4004 for (vnp=prp->name; vnp; vnp=vnp->next) {
4005 length += StringLen((CharPtr)vnp->data.ptrvalue) + 2;
4006 }
4007 s = title = (CharPtr) MemNew(length + 1);
4008 if (prp->name->data.ptrvalue) {
4009 sprintf(title, "%s",
4010 (CharPtr) prp->name->data.ptrvalue);
4011 }
4012 s += StringLen(title);
4013 if (extProtTitle) {
4014 for (vnp=prp->name->next; vnp; vnp=vnp->next) {
4015 sprintf(s, "; %s",
4016 (CharPtr) vnp->data.ptrvalue);
4017 s += StringLen((CharPtr)vnp->data.ptrvalue) + 2;
4018 }
4019 }
4020 TrimPunctuationFromEnd (title);
4021 /* if hypothetical protein, append locus_tag */
4022 if (StringICmp (title, "hypothetical protein") == 0) {
4023 sfp = NULL;
4024 if (indexed) {
4025 sfp = SeqMgrGetCDSgivenProduct (bsp, NULL);
4026 } else {
4027 /*
4028 sfp = GetCDSProtUnindexed (bsp);
4029 */
4030 sfp = GatherProtCDS(bsp);
4031 }
4032 if (sfp != NULL) {
4033 grp = SeqMgrGetGeneXref (sfp);
4034 if (grp == NULL) {
4035 loc = sfp->location;
4036 best_gene = NULL;
4037 if (indexed) {
4038 best_gene = SeqMgrGetOverlappingGene (loc, NULL);
4039 } else {
4040 best_gene = GetBestGeneUnindexed (loc, entityID);
4041 /*
4042 vnp = GatherGenesForCDS(loc);
4043 for (v=vnp; v; v=v->next) {
4044 f = (SeqFeatPtr) v->data.ptrvalue;
4045 diff_current = SeqLocAinB(loc, f->location);
4046 if (! diff_current) {
4047 best_gene = f;
4048 break;
4049 } else if (diff_current > 0) {
4050 if ((diff_lowest == -1) || (diff_current<diff_lowest)) {
4051 diff_lowest = diff_current;
4052 best_gene = f;
4053 }
4054 }
4055 }
4056 ValNodeFree(vnp);
4057 */
4058 }
4059 if (best_gene != NULL) {
4060 grp = (GeneRefPtr) best_gene->data.value.ptrvalue;
4061 }
4062 }
4063 }
4064 if (grp != NULL) {
4065 geneprod = NULL;
4066 if (grp->locus_tag != NULL) {
4067 geneprod = grp->locus_tag;
4068 }
4069 if (geneprod != NULL) {
4070 s = (CharPtr) MemNew (StringLen (geneprod) + StringLen (title) + 20);
4071 if (s != NULL) {
4072 sprintf (s, "%s %s", title, geneprod);
4073 MemFree (title);
4074 title = s;
4075 }
4076 }
4077 }
4078 }
4079 } else if (prp && prp->desc) {
4080 title = StringSave(prp->desc);
4081 } else if (prp && prp->activity) {
4082 if (prp->activity->data.ptrvalue) {
4083 title = StringSave (prp->activity->data.ptrvalue);
4084 }
4085 }
4086 }
4087 if (title == NULL) {
4088 sfp = NULL;
4089 if (indexed) {
4090 sfp = SeqMgrGetCDSgivenProduct (bsp, NULL);
4091 } else {
4092 /*
4093 sfp = GetCDSProtUnindexed (bsp);
4094 */
4095 sfp = GatherProtCDS(bsp);
4096 }
4097 if (sfp != NULL) {
4098 loc = sfp->location;
4099 for (xref = sfp->xref; xref; xref=xref->next) {
4100 if (xref->data.choice == SEQFEAT_GENE) {
4101 grp = (GeneRefPtr) xref->data.value.ptrvalue;
4102 }
4103 }
4104 if (grp) {
4105 geneprod = NULL;
4106 if (grp->locus != NULL) {
4107 geneprod = grp->locus;
4108 } else if (grp->syn != NULL) {
4109 syn = grp->syn;
4110 geneprod = (CharPtr) syn->data.ptrvalue;
4111 } else if (grp->desc != NULL) {
4112 geneprod = (CharPtr) grp->desc;
4113 }
4114 if (geneprod != NULL) {
4115 s = (CharPtr) MemNew(StringLen(geneprod) + 15);
4116 sprintf(s, "%s gene product", geneprod);
4117 title = s;
4118 }
4119 }
4120 if (title == NULL) {
4121 best_gene = NULL;
4122 if (indexed) {
4123 best_gene = SeqMgrGetOverlappingGene (loc, NULL);
4124 } else {
4125 best_gene = GetBestGeneUnindexed (loc, entityID);
4126 /*
4127 vnp = GatherGenesForCDS(loc);
4128 for (v=vnp; v; v=v->next) {
4129 f = (SeqFeatPtr) v->data.ptrvalue;
4130 diff_current = SeqLocAinB(loc, f->location);
4131 if (! diff_current) {
4132 best_gene = f;
4133 break;
4134 } else if (diff_current > 0) {
4135 if ((diff_lowest == -1) || (diff_current<diff_lowest)) {
4136 diff_lowest = diff_current;
4137 best_gene = f;
4138 }
4139 }
4140 }
4141 ValNodeFree(vnp);
4142 */
4143 }
4144 if (best_gene != NULL) {
4145 grp = (GeneRefPtr) best_gene->data.value.ptrvalue;
4146 if (grp) {
4147 geneprod = NULL;
4148 if (grp->locus != NULL) {
4149 geneprod = grp->locus;
4150 } else if (grp->syn != NULL) {
4151 syn = grp->syn;
4152 geneprod = (CharPtr) syn->data.ptrvalue;
4153 } else if (grp->desc != NULL) {
4154 geneprod = (CharPtr) grp->desc;
4155 }
4156 if (geneprod != NULL) {
4157 s = (CharPtr) MemNew(StringLen(geneprod) + 15);
4158 sprintf(s, "%s gene product", geneprod);
4159 title = s;
4160 }
4161 }
4162 }
4163 }
4164 }
4165 }
4166 if (title != NULL) {
4167 TrimPunctuationFromEnd (title);
4168 }
4169 if (title == NULL) {
4170 title = StringSave ("unnamed protein product");
4171 }
4172 return title;
4173 }
StrainNotAtEndOfTaxname(CharPtr name,CharPtr strain)4174 static Boolean StrainNotAtEndOfTaxname (CharPtr name, CharPtr strain)
4175 {
4176 size_t len;
4177 CharPtr ptr;
4178 char ch;
4179
4180 if (StringHasNoText (name) || StringHasNoText (strain)) return TRUE;
4181 ptr = StringChr (name, ' ');
4182 if (ptr == NULL) return TRUE;
4183 ptr++;
4184 ptr = StringChr (ptr, ' ');
4185 if (ptr == NULL) return TRUE;
4186 ptr++;
4187 ptr = StringISearch (ptr, strain);
4188 if (ptr == NULL) return TRUE;
4189 len = StringLen (strain);
4190 ptr += len;
4191 if (! StringHasNoText (ptr)) {
4192 if (StringCmp (ptr, "'") == 0) {
4193 ptr -= len + 1;
4194 if (*ptr == '\'') return FALSE;
4195 }
4196 return TRUE;
4197 }
4198 ptr -= len + 1;
4199 ch = *ptr;
4200 if (ch != ' ' && ch != '-' && ch != ':' && ch != ';' && ch != '.') return TRUE;
4201 return FALSE;
4202 }
GetNumClones(CharPtr str)4203 static Int2 GetNumClones (CharPtr str)
4204 {
4205 Char ch;
4206 Int2 count;
4207 if (StringHasNoText (str)) return 0;
4208 count = 1;
4209 ch = *str;
4210 while (ch != '\0') {
4211 if (ch == ';') {
4212 count++;
4213 }
4214 str++;
4215 ch = *str;
4216 }
4217 return count;
4218 }
SimpleSegSeqTitle(BioseqPtr bsp)4219 static CharPtr SimpleSegSeqTitle (BioseqPtr bsp)
4220 {
4221 BioSourcePtr biop;
4222 SeqMgrFeatContext ccontext;
4223 SeqFeatPtr cds;
4224 CharPtr clone = NULL;
4225 CharPtr complete = "gene, complete cds";
4226 SeqMgrDescContext dcontext;
4227 SeqMgrFeatContext gcontext;
4228 SeqFeatPtr gene;
4229 GeneRefPtr grp;
4230 CharPtr isolate = NULL;
4231 CharPtr label = NULL;
4232 size_t len;
4233 CharPtr locus = NULL;
4234 OrgModPtr mod;
4235 CharPtr modifier = NULL;
4236 Int2 numclones;
4237 ObjMgrDataPtr omdp;
4238 ObjMgrPtr omp;
4239 OrgNamePtr onp;
4240 OrgRefPtr orp;
4241 CharPtr organism = NULL;
4242 CharPtr product = NULL;
4243 SeqDescrPtr sdp;
4244 SubSourcePtr ssp;
4245 CharPtr str;
4246 CharPtr strain = NULL;
4247 CharPtr title;
4248 ValNodePtr vnp;
4249 Uint2 entityID;
4250
4251 if (bsp == NULL) return NULL;
4252 /* check to see if feature indexing has been called */
4253 omdp = (ObjMgrDataPtr) bsp->omdp;
4254 if (omdp == NULL) return NULL;
4255 omp = ObjMgrReadLock ();
4256 omdp = ObjMgrFindTop (omp, omdp);
4257 ObjMgrUnlock ();
4258 if (omdp == NULL) return NULL;
4259 /*
4260 if (omdp->indexed == 0) return NULL;
4261 */
4262
4263 entityID = ObjMgrGetEntityIDForPointer (bsp);
4264 if (SeqMgrFeaturesAreIndexed (entityID) == 0) {
4265 SeqMgrIndexFeatures (entityID, NULL);
4266 }
4267
4268 sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
4269 if (sdp == NULL) return NULL;
4270 biop = (BioSourcePtr) sdp->data.ptrvalue;
4271 if (biop == NULL) return NULL;
4272 orp = biop->org;
4273 if (orp != NULL && (! StringHasNoText (orp->taxname))) {
4274 organism = orp->taxname;
4275 onp = orp->orgname;
4276 if (onp != NULL) {
4277 mod = onp->mod;
4278 if (mod != NULL) {
4279 if (mod->subtype == ORGMOD_strain) {
4280 if (mod->subname != NULL && StrainNotAtEndOfTaxname (organism, mod->subname)) {
4281 strain = (CharPtr) mod->subname;
4282 }
4283 } else if (mod->subtype == ORGMOD_isolate) {
4284 isolate = (CharPtr) mod->subname;
4285 }
4286 }
4287 }
4288 } else {
4289 organism = "Unknown";
4290 }
4291 for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
4292 if (ssp->subtype == SUBSRC_clone) {
4293 if (ssp->name != NULL) {
4294 numclones = GetNumClones (ssp->name);
4295 if (numclones < 4) {
4296 clone = (CharPtr) ssp->name;
4297 }
4298 }
4299 }
4300 }
4301 cds = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &ccontext);
4302 if (cds != NULL) {
4303 if (cds->partial) {
4304 complete = "gene, partial cds";
4305 }
4306 product = ccontext.label;
4307 grp = SeqMgrGetGeneXref (cds);
4308 if (grp != NULL) {
4309 if (! StringHasNoText (grp->locus)) {
4310 locus = grp->locus;
4311 } else {
4312 vnp = grp->syn;
4313 if (vnp != NULL) {
4314 str = (CharPtr) vnp->data.ptrvalue;
4315 if (! StringHasNoText (str)) {
4316 locus = str;
4317 }
4318 }
4319 }
4320 }
4321 if (locus == NULL) {
4322 gene = SeqMgrGetOverlappingGene (cds->location, &gcontext);
4323 if (gene != NULL) {
4324 locus = gcontext.label;
4325 }
4326 }
4327 } else {
4328 if (StringDoesHaveText (strain)) {
4329 modifier = strain;
4330 label = "strain";
4331 } else if (StringDoesHaveText (clone)) {
4332 modifier = clone;
4333 label = "clone";
4334 } else if (StringDoesHaveText (isolate)) {
4335 modifier = isolate;
4336 label = "isolate";
4337 }
4338 }
4339 len = StringLen (organism) + StringLen (label) + StringLen (modifier) +
4340 StringLen (product) + StringLen (locus) + StringLen (complete);
4341 title = (CharPtr) MemNew (len + 10);
4342 if (organism != NULL) {
4343 StringCat (title, organism);
4344 }
4345 if (modifier != NULL) {
4346 StringCat (title, " ");
4347 StringCat (title, label);
4348 StringCat (title, " ");
4349 StringCat (title, modifier);
4350 }
4351 if (product != NULL) {
4352 StringCat (title, " ");
4353 StringCat (title, product);
4354 }
4355 if (locus != NULL) {
4356 StringCat (title, " (");
4357 StringCat (title, locus);
4358 StringCat (title, ")");
4359 }
4360 if (product != NULL || locus != NULL) {
4361 StringCat (title, " ");
4362 StringCat (title, complete);
4363 }
4364 TrimSpacesAroundString (title);
4365 return title;
4366 }
4367
/*
  UseOrgMods

  Manufactures a title for bsp from its BioSource descriptor by joining,
  in this order:

    taxname
    " strain <x>"      first strain OrgMod not already ending the taxname
                       (cut at the first ';', trailing punctuation trimmed)
    " chromosome <x>"
    " clone <x>"       or ", <n> clones" when more than three clones are
                       listed, or ", pooled multiple clones" when
                       htgs_pooled_multiclone is set
    " map <x>"
    " plasmid <x>"     only when tech is MI_TECH_wgs
    " <suffix>"        when suffix is not NULL

  When a subsource type occurs more than once the last occurrence is used.
  The result has surrounding spaces trimmed and its first character
  upper-cased.

  Returns a string allocated with MemNew (caller frees), or NULL when bsp
  is NULL, no usable source descriptor is found, or allocation fails.
*/
static CharPtr UseOrgMods(BioseqPtr bsp, CharPtr suffix, Uint1 tech, Boolean htgs_pooled_multiclone)
{
  ItemInfoPtr   iip = NULL;
  ValNodePtr    vnp;
  BioSourcePtr  biop;
  OrgModPtr     mod;
  OrgNamePtr    onp;
  OrgRefPtr     orp;
  SubSourcePtr  ssp;
  Char          ch;
  CharPtr       name = NULL, chr = NULL, str = NULL,
                cln = NULL, map = NULL, pls = NULL, def = NULL, ptr;
  size_t        deflen = 0;   /* size_t so that long names cannot overflow the total */
  size_t        len;
  Int2          numclones;

  if (bsp == NULL) {
    return NULL;
  }
  if ((vnp=GatherDescrOnBioseq(iip, bsp, Seq_descr_source,FALSE)) == NULL) {
    return NULL;
  }
  biop = (BioSourcePtr) vnp->data.ptrvalue;
  if (biop == NULL) {
    /* descriptor without a payload: nothing to build a title from */
    return NULL;
  }
  orp = biop->org;
  if (orp && orp->taxname) {
    name = StringSave(orp->taxname);
    deflen += StringLen(orp->taxname);
  }

  /*
    Each piece is built in its own buffer.  A buffer left over from an
    earlier subsource of the same type is released before being replaced,
    and a failed allocation simply drops that piece.
  */
  for (ssp = biop->subtype; ssp; ssp=ssp->next) {
    if (ssp->name == NULL) continue;
    if (ssp->subtype == SUBSRC_chromosome) { /* chromosome */
      len = StringLen(ssp->name) + 13;
      MemFree(chr);
      chr = (CharPtr) MemNew(len);
      if (chr != NULL) {
        sprintf(chr, " chromosome %s", ssp->name);
        deflen += len;
      }
    } else if (ssp->subtype == SUBSRC_clone) { /* clone */
      numclones = GetNumClones (ssp->name);
      MemFree(cln);
      if (htgs_pooled_multiclone) {
        cln = (CharPtr) MemNew (30);
        if (cln != NULL) {
          sprintf (cln, ", pooled multiple clones");
          deflen += StringLen (cln) + 2;
        }
      } else if (numclones > 3) {
        cln = (CharPtr) MemNew (20);
        if (cln != NULL) {
          sprintf (cln, ", %d clones", (int) numclones);
          deflen += StringLen (cln) + 2;
        }
      } else {
        len = StringLen(ssp->name) + 8;
        cln = (CharPtr) MemNew(len);
        if (cln != NULL) {
          sprintf(cln, " clone %s", ssp->name);
          deflen += len;
        }
      }
    } else if (ssp->subtype == SUBSRC_map) { /* map */
      len = StringLen(ssp->name) + 7;
      MemFree(map);
      map = (CharPtr) MemNew(len);
      if (map != NULL) {
        sprintf(map, " map %s", ssp->name);
        deflen += len;
      }
    } else if (ssp->subtype == SUBSRC_plasmid_name) { /* plasmid name */
      len = StringLen(ssp->name) + 10;
      MemFree(pls);
      pls = (CharPtr) MemNew(len);
      if (pls != NULL) {
        sprintf(pls, " plasmid %s", ssp->name);
        deflen += len;
      }
    }
  }

  if (orp != NULL) {
    onp = orp->orgname;
    if (onp != NULL) {
      for (mod = onp->mod; mod != NULL; mod = mod->next) {
        if (mod->subtype != ORGMOD_strain) continue; /* strain */
        if (StringDoesHaveText (str)) continue;      /* first usable strain wins */
        if (mod->subname != NULL && StrainNotAtEndOfTaxname (name, mod->subname)) {
          len = StringLen(mod->subname) + 9;
          str = (CharPtr) MemNew(len);
          if (str != NULL) {
            sprintf(str, " strain %s", mod->subname);
            deflen += len;
            /* keep only the part before the first semicolon */
            ptr = StringChr (str, ';');
            if (ptr != NULL) {
              *ptr = '\0';
            }
            TrimNonPeriodPunctuationFromEnd (str);
          }
        }
      }
    }
  }

  deflen += StringLen (suffix) + 2;
  def = (CharPtr) MemNew(deflen+1);
  if (def != NULL) {
    if (name) {
      StringCat(def, name);
    }
    if (str) {
      StringCat(def, str);
    }
    if (chr) {
      StringCat(def, chr);
    }
    if (cln) {
      StringCat(def, cln);
    }
    if (map) {
      StringCat(def, map);
    }
    if (pls && tech == MI_TECH_wgs) {
      StringCat(def, pls);
    }
    if (suffix) {
      StringCat(def, " ");
      StringCat(def, suffix);
    }
    TrimSpacesAroundString (def);
    ch = def [0];
    def [0] = TO_UPPER (ch);
  }

  /* the piece buffers are released whether or not def could be allocated */
  MemFree(name);
  MemFree(str);
  MemFree(chr);
  MemFree(cln);
  MemFree(map);
  MemFree(pls);

  return def;
}
4489
4490 /*
4491 The following lists need endogenous virus, hydrogenosome, chromosome, and chromatophore
4492 */
4493
4494 static CharPtr organelleByItself [] = {
4495 NULL,
4496 NULL,
4497 "chloroplast",
4498 "chromoplast",
4499 "kinetoplast",
4500 "mitochondrion",
4501 "plastid",
4502 "macronuclear",
4503 "extrachromosomal",
4504 "plasmid",
4505 NULL,
4506 NULL,
4507 "cyanelle",
4508 "provirus",
4509 "virus",
4510 "nucleomorph",
4511 "apicoplast",
4512 "leucoplast",
4513 "protoplast",
4514 NULL,
4515 NULL,
4516 NULL,
4517 NULL
4518 };
4519 static CharPtr organelleWithPlasmid [] = {
4520 NULL,
4521 NULL,
4522 "chloroplast",
4523 "chromoplast",
4524 "kinetoplast",
4525 "mitochondrial",
4526 "plastid",
4527 "macronuclear",
4528 "extrachrom",
4529 "plasmid",
4530 NULL,
4531 NULL,
4532 "cyanelle",
4533 "proviral",
4534 "virus",
4535 "nucleomorph",
4536 "apicoplast",
4537 "leucoplast",
4538 "protoplast",
4539 NULL,
4540 NULL,
4541 NULL,
4542 NULL
4543 };
4544 static CharPtr organelleForWGS [] = {
4545 NULL,
4546 NULL,
4547 "chloroplast",
4548 "chromoplast",
4549 "kinetoplast",
4550 "mitochondrial",
4551 "plastid",
4552 "",
4553 "",
4554 "",
4555 "",
4556 "",
4557 "cyanelle",
4558 "proviral",
4559 "virus",
4560 "",
4561 "apicoplast",
4562 "leucoplast",
4563 "proplastid",
4564 "endogenous virus",
4565 "hydrogenosome",
4566 "chromosome",
4567 "chromatophore"
4568 };
4569
4570 const Int4 kNumWGSOrganelles = sizeof (organelleForWGS) / sizeof (CharPtr);
4571
4572
/*
  LowercasePlasmidOrElement

  Edits def in place: every occurrence of "plasmid" or "element" (matched
  without regard to case) that starts with a capital P or E gets that
  initial letter lower-cased.  The first character of def is never
  examined, so a leading capital is preserved.  Only the initial letter
  of each match is changed.
*/
static void LowercasePlasmidOrElement (CharPtr def)
{
  CharPtr  hit;
  CharPtr  scan;

  if (StringHasNoText (def)) return;

  /* begin after the first character */
  scan = def + 1;

  for (hit = StringISearch (scan, "plasmid");
       hit != NULL;
       hit = StringISearch (hit + 7, "plasmid")) {
    if (*hit == 'P') {
      *hit = 'p';
    }
  }

  for (hit = StringISearch (scan, "element");
       hit != NULL;
       hit = StringISearch (hit + 7, "element")) {
    if (*hit == 'E') {
      *hit = 'e';
    }
  }
}
4593
4594
MakeCompleteChromTitle(BioseqPtr bsp,Uint1 biomol,Uint1 completeness)4595 NLM_EXTERN CharPtr MakeCompleteChromTitle (BioseqPtr bsp, Uint1 biomol, Uint1 completeness)
4596 {
4597 CharPtr completeseq = ", complete sequence";
4598 CharPtr completegen = ", complete genome";
4599 ItemInfoPtr iip = NULL;
4600 ValNodePtr vnp;
4601 BioSourcePtr biop;
4602 OrgRefPtr orp;
4603 SubSourcePtr ssp;
4604 CharPtr name = NULL, chr = NULL, orgnl = NULL,
4605 seg = NULL, pls = NULL, def = NULL;
4606 Int2 deflen = 80; /* starts with space for all fixed text */
4607 Char ch;
4608 Boolean plasmid;
4609 Uint1 genome;
4610 if (bsp == NULL) {
4611 return NULL;
4612 }
4613 if ((vnp=GatherDescrOnBioseq(iip, bsp, Seq_descr_source,TRUE)) == NULL) {
4614 return NULL;
4615 }
4616 biop = (BioSourcePtr) vnp->data.ptrvalue;
4617 if (biop == NULL) {
4618 return NULL;
4619 }
4620 orp = biop->org;
4621 if (orp == NULL || orp->taxname == NULL) {
4622 return NULL;
4623 }
4624 name = orp->taxname;
4625 deflen += StringLen(orp->taxname);
4626 genome = biop->genome;
4627 plasmid = (Boolean) (biop->genome == GENOME_plasmid);
4628 for (ssp = biop->subtype; ssp; ssp=ssp->next) {
4629 if (ssp->subtype == SUBSRC_chromosome) {
4630 if (ssp->name != NULL) {
4631 chr = ssp->name;
4632 deflen += StringLen(ssp->name);
4633 }
4634 } else if (ssp->subtype == SUBSRC_segment) {
4635 if (ssp->name != NULL) {
4636 seg = ssp->name;
4637 deflen += StringLen(ssp->name);
4638 }
4639 } else if (ssp->subtype == SUBSRC_plasmid_name) {
4640 if (ssp->name != NULL) {
4641 pls = ssp->name;
4642 deflen += StringLen(ssp->name);
4643 }
4644 }
4645 }
4646 if (genome < kNumWGSOrganelles) {
4647 if (pls != NULL) {
4648 orgnl = organelleWithPlasmid [genome];
4649 } else {
4650 orgnl = organelleByItself [genome];
4651 }
4652 if (StringISearch (name, "virus") != NULL || StringISearch (name, "phage") != NULL) {
4653 if (genome == GENOME_proviral || genome == GENOME_virion) {
4654 orgnl = NULL;
4655 }
4656 }
4657 }
4658 if (completeness == 2 ||
4659 completeness == 3 ||
4660 completeness == 4 ||
4661 completeness == 5) {
4662 /* remove "complete" component */
4663 completeseq = ", partial sequence";
4664 completegen = ", genome";
4665 }
4666 def = (CharPtr) MemNew(deflen+1);
4667 if (StringISearch (name, "plasmid") != NULL) {
4668 StringCat(def, name);
4669 StringCat (def, completeseq);
4670 ch = *def;
4671 *def = TO_UPPER (ch);
4672 LowercasePlasmidOrElement (def);
4673 return def;
4674 } else if (plasmid) {
4675 if (name && (! pls)) {
4676 StringCat (def, name);
4677 StringCat (def, " unnamed plasmid");
4678 StringCat (def, completeseq);
4679 ch = *def;
4680 *def = TO_UPPER (ch);
4681 return def;
4682 }
4683 if (pls) {
4684 if (name) {
4685 StringCat (def, name);
4686 StringCat (def, " ");
4687 }
4688 if (StringISearch (pls, "plasmid") == NULL && StringISearch (pls, "element") == NULL) {
4689 StringCat(def, "plasmid ");
4690 }
4691 StringCat (def, pls);
4692 StringCat (def, completeseq);
4693 ch = *def;
4694 *def = TO_UPPER (ch);
4695 LowercasePlasmidOrElement (def);
4696 return def;
4697 }
4698 } else if (pls) {
4699 if (name) {
4700 StringCat (def, name);
4701 StringCat (def, " ");
4702 }
4703 if (orgnl != NULL) {
4704 StringCat (def, orgnl);
4705 StringCat (def, " ");
4706 }
4707 if (StringISearch (pls, "plasmid") == NULL && StringISearch (pls, "element") == NULL) {
4708 StringCat (def, "plasmid ");
4709 }
4710 StringCat (def, pls);
4711 StringCat (def, completeseq);
4712 ch = *def;
4713 *def = TO_UPPER (ch);
4714 LowercasePlasmidOrElement (def);
4715 return def;
4716 } else if (name) {
4717 StringCat (def, name);
4718 }
4719 if (orgnl != NULL) {
4720 if (chr != NULL) {
4721 StringCat (def, " ");
4722 StringCat (def, orgnl);
4723 StringCat (def, " chromosome ");
4724 StringCat(def, chr);
4725 StringCat (def, completeseq);
4726 ch = *def;
4727 *def = TO_UPPER (ch);
4728 return def;
4729 }
4730 StringCat (def, " ");
4731 StringCat (def, orgnl);
4732 StringCat (def, completegen);
4733 ch = *def;
4734 *def = TO_UPPER (ch);
4735 return def;
4736 }
4737 if (seg != NULL) {
4738 StringCat (def, " ");
4739 if (StringStr (seg, "DNA") == NULL &&
4740 StringStr (seg, "RNA") == NULL &&
4741 StringStr (seg, "segment") == NULL &&
4742 StringStr (seg, "Segment") == NULL) {
4743 StringCat (def, "segment ");
4744 }
4745 StringCat(def, seg);
4746 StringCat (def, completeseq);
4747 ch = *def;
4748 *def = TO_UPPER (ch);
4749 return def;
4750 }
4751 if (chr != NULL) {
4752 StringCat (def, " chromosome ");
4753 StringCat(def, chr);
4754 StringCat (def, completeseq);
4755 ch = *def;
4756 *def = TO_UPPER (ch);
4757 return def;
4758 }
4759 StringCat (def, completegen);
4760 ch = *def;
4761 *def = TO_UPPER (ch);
4762 return def;
4763 }
NotSpecialTaxName(CharPtr taxname)4764 static Boolean NotSpecialTaxName (CharPtr taxname)
4765 {
4766 if (StringHasNoText (taxname)) return TRUE;
4767 if (StringICmp (taxname, "synthetic construct") == 0) return FALSE;
4768 if (StringICmp (taxname, "artificial sequence") == 0) return FALSE;
4769 if (StringStr (taxname, "vector") != NULL) return FALSE;
4770 if (StringStr (taxname, "Vector") != NULL) return FALSE;
4771 return TRUE;
4772 }
/*
  DoTpaPrefix

  Decides whether a "TSA: ", "TPA: ", "TPA_exp: " or "TPA_inf: " prefix
  has to be written in front of title.

  On return *ttl points at the text to print (title itself, or title
  past a leading "TPA: " that is being replaced by the more specific
  TPA_exp/TPA_inf prefix) and *pfx is the prefix literal or NULL.

  Returns TRUE when *pfx must be emitted before *ttl.  Returns FALSE when
  title is NULL or empty, when the record is neither TSA nor TPA, or when
  title already starts with the wanted prefix (compared without regard to
  case).  is_tsa takes precedence over is_tpa.

  ttl and pfx must point at variables owned by the caller.  The prefixes
  are string literals, so they remain valid after this function returns.
*/
static Boolean DoTpaPrefix (
  CharPtr title,
  CharPtr PNTR ttl,
  CharPtr PNTR pfx,
  Boolean is_tpa,
  Boolean tpa_exp,
  Boolean tpa_inf,
  Boolean is_tsa
)
{
  CharPtr  wanted;
  Boolean  replaces_plain_tpa = FALSE;

  *ttl = title;
  *pfx = NULL;
  if (title == NULL || *title == '\0') return FALSE;

  /* choose the prefix this record calls for */
  if (is_tsa) {
    wanted = "TSA: ";
  } else if (! is_tpa) {
    return FALSE;
  } else if (tpa_exp) {
    wanted = "TPA_exp: ";
    replaces_plain_tpa = TRUE;
  } else if (tpa_inf) {
    wanted = "TPA_inf: ";
    replaces_plain_tpa = TRUE;
  } else {
    wanted = "TPA: ";
  }

  /* nothing to add when the title already carries it */
  if (StringNICmp (title, wanted, StringLen (wanted)) == 0) return FALSE;

  *pfx = wanted;
  if (replaces_plain_tpa && StringNICmp (title, "TPA: ", 5) == 0) {
    /* drop the generic prefix in favour of the specific one */
    *ttl = title + 5;
  }
  return TRUE;
}
4815
4816 /*****************************************************************************
4817 *
4818 * CreateDefLine(iip, bsp, buf, buflen, tech)
4819 * Finds or makes a FASTA format defline using Gather functions
4820 * buf should be very long if possible
4821 * function truncates if buf not long enough
4822 * a few deflines are longer than 255
4823 *
4824 * ItemInfoPtr iip is used in flat file generator to keep entityId, itemId
4825 * and itemtype
4826 *****************************************************************************/
CreateDefLineExEx(ItemInfoPtr iip,BioseqPtr bsp,CharPtr buf,Uint4 buflen,Uint1 tech,CharPtr accession,CharPtr organism,Boolean ignoreTitle,Boolean extProtTitle)4827 NLM_EXTERN Boolean CreateDefLineExEx (ItemInfoPtr iip, BioseqPtr bsp, CharPtr buf, Uint4 buflen, Uint1 tech,
4828 CharPtr accession, CharPtr organism, Boolean ignoreTitle, Boolean extProtTitle)
4829 {
4830 ValNodePtr vnp = NULL;
4831 CharPtr tmp = NULL, title = NULL, ttl = NULL, pfx = NULL;
4832 PdbBlockPtr pbp;
4833 PatentSeqIdPtr psip;
4834 PDBSeqIdPtr pdbip;
4835 Uint4 diff, phase, i;
4836 Boolean doit;
4837 Int4 num_segs, num_gaps;
4838 static Char tbuf[128];
4839 static CharPtr htgs[2] = {
4840 "unordered", "ordered" };
4841 static CharPtr htg_phrase[3] = {
4842 "LOW-PASS SEQUENCE SAMPLING",
4843 "WORKING DRAFT SEQUENCE",
4844 "*** SEQUENCING IN PROGRESS ***" };
4845 Boolean htg_tech = FALSE, htgs_draft = FALSE, htgs_cancelled = FALSE,
4846 htgs_pooled_multiclone = FALSE, is_nc = FALSE, is_nm = FALSE,
4847 is_nr = FALSE, is_tpa = FALSE, tpa_exp = FALSE, tpa_inf = FALSE,
4848 is_tsa = FALSE;
4849 MolInfoPtr mip;
4850 GBBlockPtr gbp = NULL;
4851 EMBLBlockPtr ebp = NULL;
4852 ValNodePtr keywords = NULL;
4853 Boolean wgsmaster = FALSE;
4854 CharPtr suffix = NULL;
4855 SeqIdPtr sip;
4856 TextSeqIdPtr tsip;
4857 DbtagPtr general = NULL, dbt;
4858 ObjectIdPtr oip;
4859 ItemInfo ii;
4860 BioSourcePtr biop = NULL;
4861 OrgRefPtr orp;
4862 CharPtr taxname = NULL;
4863 SeqMgrDescContext dcontext;
4864 SeqMgrFeatContext fcontext;
4865 SeqFeatPtr sfp, src;
4866 Uint2 entityID;
4867 Uint1 genome;
4868 CharPtr orgnl = NULL;
4869
4870 if ((bsp == NULL) || (buf == NULL) || buflen == 0) return FALSE;
4871 /* now using GetNextDescriptorUnindexed, so need to have called AssignIDsInEntityEx */
4872 if (bsp->idx.entityID == 0) {
4873 entityID = ObjMgrGetEntityIDForPointer (bsp);
4874 if (entityID != 0) {
4875 AssignIDsInEntityEx (entityID, 0, NULL, NULL);
4876 }
4877 }
4878 entityID = bsp->idx.entityID;
4879 for (sip = bsp->id; sip != NULL; sip = sip->next) {
4880 switch (sip->choice) {
4881 case SEQID_OTHER :
4882 tsip = (TextSeqIdPtr) sip->data.ptrvalue;
4883 if (tsip != NULL && tsip->accession != NULL) {
4884 if (StringNICmp (tsip->accession, "NC_", 3) == 0) {
4885 is_nc = TRUE;
4886 } else if (StringNICmp (tsip->accession, "NM_", 3) == 0) {
4887 is_nm = TRUE;
4888 } else if (StringNICmp (tsip->accession, "NR_", 3) == 0) {
4889 is_nr = TRUE;
4890 }
4891 }
4892 break;
4893 case SEQID_TPG :
4894 case SEQID_TPE :
4895 case SEQID_TPD :
4896 is_tpa = TRUE;
4897 break;
4898 case SEQID_GENERAL :
4899 dbt = (DbtagPtr) sip->data.ptrvalue;
4900 if (dbt != NULL && (! IsSkippableDbtag (dbt))) {
4901 general = dbt;
4902 }
4903 break;
4904 case SEQID_GENBANK :
4905 case SEQID_EMBL :
4906 case SEQID_DDBJ :
4907 tsip = (TextSeqIdPtr) sip->data.ptrvalue;
4908 if (tsip != NULL && tsip->accession != NULL) {
4909 if (StringLen (tsip->accession) == 12) {
4910 if (StringCmp (tsip->accession + 6, "000000") == 0) {
4911 wgsmaster = TRUE;
4912 }
4913 } else if (StringLen (tsip->accession) == 13) {
4914 if (StringCmp (tsip->accession + 6, "0000000") == 0) {
4915 wgsmaster = TRUE;
4916 }
4917 } else if (StringLen (tsip->accession) == 14) {
4918 if (StringCmp (tsip->accession + 6, "00000000") == 0) {
4919 wgsmaster = TRUE;
4920 }
4921 }
4922 }
4923 break;
4924 case SEQID_GPIPE :
4925 tsip = (TextSeqIdPtr) sip->data.ptrvalue;
4926 break;
4927 default :
4928 break;
4929 }
4930 }
4931 buflen--;
4932 buf[buflen] = '\0';
4933 tbuf[0] = '\0';
4934
4935 if (tech == 0) {
4936 vnp = GetNextDescriptorUnindexed (bsp, Seq_descr_molinfo, NULL);
4937 if (vnp != NULL) {
4938 mip = (MolInfoPtr) vnp->data.ptrvalue;
4939 if (mip != NULL) {
4940 tech = mip->tech;
4941 }
4942 }
4943 }
4944
4945 if (((tech >= MI_TECH_htgs_1) && (tech <= MI_TECH_htgs_3)) ||
4946 (tech == MI_TECH_htgs_0)) {
4947 htg_tech = TRUE;
4948 } else if (tech == MI_TECH_tsa) {
4949 is_tsa = TRUE;
4950 }
4951 if (iip == NULL && accession != NULL) {
4952 diff = LabelCopyExtra(buf, accession, buflen, "(", ") ");
4953 buflen -= diff;
4954 buf += diff;
4955 }
4956 diff = 0;
4957 if (htg_tech || is_tpa) {
4958 vnp=GatherDescrOnBioseq(iip, bsp, Seq_descr_genbank,TRUE);
4959 if (vnp != NULL) {
4960 gbp = (GBBlockPtr) vnp->data.ptrvalue;
4961 if (gbp != NULL) {
4962 keywords = gbp->keywords;
4963 }
4964 }
4965 vnp=GatherDescrOnBioseq(iip, bsp, Seq_descr_embl,TRUE);
4966 if (vnp != NULL) {
4967 ebp = (EMBLBlockPtr) vnp->data.ptrvalue;
4968 if (ebp != NULL) {
4969 keywords = ebp->keywords;
4970 }
4971 }
4972 }
4973 if (keywords != NULL) {
4974 for (vnp = keywords; vnp != NULL; vnp = vnp->next) {
4975 if (StringICmp ((CharPtr) vnp->data.ptrvalue, "HTGS_DRAFT") == 0) {
4976 htgs_draft = TRUE;
4977 } else if (StringICmp ((CharPtr) vnp->data.ptrvalue, "HTGS_CANCELLED") == 0) {
4978 htgs_cancelled = TRUE;
4979 } else if (StringICmp ((CharPtr) vnp->data.ptrvalue, "HTGS_POOLED_MULTICLONE") == 0 && htg_tech) {
4980 htgs_pooled_multiclone = TRUE;
4981 } else if (StringICmp ((CharPtr) vnp->data.ptrvalue, "TPA:experimental") == 0) {
4982 tpa_exp = TRUE;
4983 } else if (StringICmp ((CharPtr) vnp->data.ptrvalue, "TPA:inferential") == 0) {
4984 tpa_inf = TRUE;
4985 }
4986 }
4987 }
4988 if (! ignoreTitle)
4989 {
4990 vnp=GatherDescrOnBioseq(iip, bsp, Seq_descr_title,TRUE);
4991 if (vnp != NULL)
4992 title = StringSaveNoNull((CharPtr)vnp->data.ptrvalue);
4993 if (title != NULL) {
4994 TrimSpacesAroundString (title);
4995 TrimPunctuationFromEnd (title);
4996 }
4997 }
4998 if (tech == MI_TECH_htgs_0 || tech == MI_TECH_htgs_1 || tech == MI_TECH_htgs_2) {
4999 MemFree(title); /* manufacture all HTG titles */
5000 title = NULL;
5001 if (iip != NULL) {
5002 iip->entityID = 0;
5003 iip->itemID = 0;
5004 iip->itemtype = 0;
5005 }
5006 if (title == NULL || *title == '\0') {
5007 title = UseOrgMods(bsp, NULL, tech, htgs_pooled_multiclone);
5008 organism = NULL;
5009 }
5010 } else if (tech == MI_TECH_est || tech == MI_TECH_sts || tech == MI_TECH_survey) {
5011 if (title == NULL || *title == '\0') {
5012 title = UseOrgMods(bsp, NULL, tech, FALSE);
5013 organism = NULL;
5014 }
5015 } else if (tech == MI_TECH_wgs) {
5016 if (title == NULL || *title == '\0') {
5017 if (! wgsmaster) {
5018 if (general != NULL) {
5019 oip = general->tag;
5020 if (oip != NULL) {
5021 if (! StringHasNoText (oip->str)) {
5022 suffix = oip->str;
5023 }
5024 }
5025 }
5026 }
5027 title = UseOrgMods(bsp, suffix, tech, FALSE);
5028 organism = NULL;
5029 }
5030 } else if (tech == MI_TECH_tsa) {
5031 if (title == NULL || *title == '\0') {
5032 if (general != NULL) {
5033 oip = general->tag;
5034 if (oip != NULL) {
5035 if (! StringHasNoText (oip->str)) {
5036 suffix = oip->str;
5037 }
5038 }
5039 }
5040 title = UseOrgMods(bsp, suffix, tech, FALSE);
5041 organism = NULL;
5042 }
5043 } else if (is_nc && title == NULL) {
5044 /* manufacture complete chromosome titles if not already present */
5045 vnp = GatherDescrOnBioseq (&ii, bsp, Seq_descr_molinfo,TRUE);
5046 if (vnp != NULL) {
5047 mip = (MolInfoPtr) vnp->data.ptrvalue;
5048 if (mip != NULL &&
5049 (mip->biomol == MOLECULE_TYPE_GENOMIC || mip->biomol == MOLECULE_TYPE_OTHER_GENETIC_MATERIAL) /* && mip->completeness == 1 */) {
5050 title = MakeCompleteChromTitle (bsp, mip->biomol, mip->completeness);
5051 organism = NULL;
5052 if (iip != NULL) {
5053 iip->entityID = ii.entityID;
5054 iip->itemID = ii.itemID;
5055 iip->itemtype = ii.itemtype;
5056 }
5057 }
5058 }
5059 } else if (is_nm && title == NULL) {
5060 title = FindNMDefLine (bsp);
5061 if (title != NULL && iip != NULL) {
5062 iip->entityID = 0;
5063 iip->itemID = 0;
5064 iip->itemtype = 0;
5065 }
5066 } else if (is_nr && title == NULL) {
5067 title = FindNRDefLine (bsp);
5068 if (title != NULL && iip != NULL) {
5069 iip->entityID = 0;
5070 iip->itemID = 0;
5071 iip->itemtype = 0;
5072 }
5073 }
5074 /* some titles may have zero length */
5075 if (title != NULL && *title != '\0') {
5076 ttl = title;
5077 pfx = NULL;
5078 if (DoTpaPrefix (title, &ttl, &pfx, is_tpa, tpa_exp, tpa_inf, is_tsa)) {
5079 diff = LabelCopy (buf, pfx, buflen);
5080 buflen -= diff;
5081 buf += diff;
5082 }
5083 diff = LabelCopy (buf, ttl, buflen);
5084 /* remove trailing blanks BUT NOT periods */
5085 tmp = buf + diff - 1; /* point at last character */
5086 while (tmp >= buf && ((*tmp <= ' ') /* || (*tmp == '.') */)) {
5087 *tmp = '\0';
5088 tmp--;
5089 diff--;
5090 }
5091 } else if ((vnp = GatherDescrOnBioseq(iip, bsp, Seq_descr_pdb,TRUE)) != NULL) {
5092 pbp = (PdbBlockPtr)(vnp->data.ptrvalue);
5093 for (vnp = bsp->id; vnp != NULL; vnp = vnp->next) {
5094 if (vnp->choice == SEQID_PDB) {
5095 pdbip = (PDBSeqIdPtr)(vnp->data.ptrvalue);
5096 if (pdbip && pdbip->chain > 32) {
5097 sprintf(tbuf, "Chain %c, ", pdbip->chain);
5098 diff = LabelCopy(buf, tbuf, buflen);
5099 buflen -= diff;
5100 buf += diff;
5101 break;
5102 }
5103 }
5104 }
5105 if (pbp && pbp->compound) {
5106 tmp = StringSave ((CharPtr)(pbp->compound->data.ptrvalue));
5107 TrimNonPeriodPunctuationFromEnd (tmp);
5108 diff = LabelCopy(buf, tmp, buflen);
5109 MemFree (tmp);
5110 }
5111 } else {
5112 for (vnp = bsp->id; vnp != NULL; vnp = vnp->next) {
5113 if (vnp->choice == SEQID_PATENT)
5114 {
5115 psip = (PatentSeqIdPtr)(vnp->data.ptrvalue);
5116 if (psip) {
5117 sprintf(tbuf, "Sequence %d from Patent %s %s",
5118 (int)psip->seqid, psip->cit->country, psip->cit->number);
5119 diff = LabelCopy(buf, tbuf, buflen);
5120 break;
5121 }
5122 }
5123 }
5124 if (vnp == NULL) {
5125 if (ISA_aa(bsp->mol)) {
5126 title = FindProtDefLine(bsp, extProtTitle);
5127 vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
5128 if (vnp != NULL && organism == NULL) {
5129 biop = (BioSourcePtr) vnp->data.ptrvalue;
5130 if (biop != NULL) {
5131 orp = biop->org;
5132 if (orp != NULL) {
5133 taxname = orp->taxname;
5134 }
5135 }
5136 if (taxname == NULL || NotSpecialTaxName (taxname)) {
5137 if (SeqMgrFeaturesAreIndexed (entityID) == 0) {
5138 SeqMgrIndexFeatures (entityID, NULL);
5139 }
5140 sfp = SeqMgrGetCDSgivenProduct (bsp, NULL);
5141 if (sfp != NULL) {
5142 src = SeqMgrGetOverlappingSource (sfp->location, &fcontext);
5143 if (src != NULL) {
5144 biop = (BioSourcePtr) src->data.value.ptrvalue;
5145 if (biop != NULL) {
5146 orp = biop->org;
5147 if (orp != NULL) {
5148 taxname = orp->taxname;
5149 }
5150 }
5151 }
5152 }
5153 }
5154 }
5155 }
5156 if (title != NULL) {
5157 /*
5158 if (! StringHasNoText (taxname)) {
5159 diff = LabelCopy(buf, taxname, buflen);
5160 buflen -= diff;
5161 buf += diff;
5162 diff = LabelCopy(buf, " ", buflen);
5163 buflen -= diff;
5164 buf += diff;
5165 diff = LabelCopy(buf, title, buflen);
5166 } else {
5167 diff = LabelCopy(buf, title, buflen);
5168 }
5169 */
5170 ttl = title;
5171 pfx = NULL;
5172 if (DoTpaPrefix (title, &ttl, &pfx, is_tpa, tpa_exp, tpa_inf, is_tsa)) {
5173 diff = LabelCopy (buf, pfx, buflen);
5174 buflen -= diff;
5175 buf += diff;
5176 }
5177 diff = LabelCopy (buf, ttl, buflen);
5178 if (organism == NULL && taxname != NULL) {
5179 organism = taxname;
5180 iip = NULL;
5181 }
5182 } else if (!htg_tech) {
5183 if (bsp->repr == Seq_repr_seg) {
5184 title = SimpleSegSeqTitle (bsp);
5185 }
5186 if (title == NULL) {
5187 title = UseOrgMods(bsp, NULL, tech, FALSE);
5188 }
5189 ttl = title;
5190 pfx = NULL;
5191 if (DoTpaPrefix (title, &ttl, &pfx, is_tpa, tpa_exp, tpa_inf, is_tsa)) {
5192 diff = LabelCopy (buf, pfx, buflen);
5193 buflen -= diff;
5194 buf += diff;
5195 }
5196 if (ttl != NULL) {
5197 diff = LabelCopy (buf, ttl, buflen);
5198 } else {
5199 diff = LabelCopy (buf, "No definition line found", buflen);
5200 }
5201 }
5202 }
5203 }
5204 if (title != NULL) {
5205 TrimNonPeriodPunctuationFromEnd (title);
5206 }
5207 buflen -= diff;
5208 buf += diff;
5209 if (htg_tech) {
5210 if (tech == MI_TECH_htgs_0)
5211 phase = 0;
5212 else
5213 phase = (Uint4)(tech - MI_TECH_htgs_1 + 1);
5214 if (title == NULL|| *title == '\0') {
5215 title = UseOrgMods(bsp, NULL, tech, htgs_pooled_multiclone);
5216 organism = NULL;
5217 if (title != NULL) {
5218 ttl = title;
5219 pfx = NULL;
5220 if (DoTpaPrefix (title, &ttl, &pfx, is_tpa, tpa_exp, tpa_inf, is_tsa)) {
5221 diff = LabelCopy (buf, pfx, buflen);
5222 buflen -= diff;
5223 buf += diff;
5224 }
5225 diff = LabelCopy (buf, ttl, buflen);
5226 buflen -= diff;
5227 buf += diff;
5228 }
5229 }
5230 if (phase == 3)
5231 {
5232 if (title) {
5233 if (title && StringStr(title, "complete sequence") == NULL) {
5234 diff = LabelCopy(buf, ", complete sequence", buflen);
5235 buflen -= diff;
5236 buf += diff;
5237 }
5238 }
5239 } else {
5240 doit = FALSE;
5241 if (phase == 0) {
5242 if (StringStr(title, "LOW-PASS") == NULL) {
5243 doit = TRUE;
5244 i = 0;
5245 }
5246 } else {
5247 if (htgs_draft) {
5248 if (StringStr(title, "WORKING DRAFT") == NULL) {
5249 doit = TRUE;
5250 i = 1;
5251 }
5252 } else if (! htgs_cancelled) {
5253 if (StringStr(title, "SEQUENCING IN") == NULL) {
5254 doit = TRUE;
5255 i = 2;
5256 }
5257 }
5258 }
5259 if (doit)
5260 {
5261 if (diff != 0) {
5262 diff = LabelCopy(buf, ", ", buflen);
5263 buflen -= diff;
5264 buf += diff;
5265 }
5266 diff = LabelCopy(buf, htg_phrase[i], buflen);
5267 buflen -= diff;
5268 buf += diff;
5269 }
5270 if ((phase != 0) && (bsp->repr == Seq_repr_delta)) {
5271 if (CountGapsInDeltaSeq(bsp,
5272 &num_segs, &num_gaps, NULL, NULL, NULL, 0))
5273 {
5274 if (num_gaps > 0) {
5275 sprintf(tbuf, ", %ld %s pieces", (long)(num_gaps + 1), htgs[phase - 1]);
5276 } else {
5277 /*
5278 sprintf(tbuf, ", %ld %s piece", (long)(num_gaps + 1), htgs[phase - 1]);
5279 */
5280 }
5281 diff = LabelCopy(buf, tbuf, buflen);
5282 buflen -= diff;
5283 buf += diff;
5284 }
5285 }
5286 else if (phase != 0) {
5287 /*
5288 sprintf(tbuf, ", in %s pieces", htgs[phase-1]);
5289 diff = LabelCopy(buf, tbuf, buflen);
5290 buflen -= diff;
5291 buf += diff;
5292 */
5293 }
5294 }
5295 } else if (tech == MI_TECH_est || tech == MI_TECH_sts || tech == MI_TECH_survey || tech == MI_TECH_wgs) {
5296 if (title == NULL|| *title == '\0') {
5297 title = UseOrgMods(bsp, NULL, tech, FALSE);
5298 organism = NULL;
5299 if (title != NULL) {
5300 ttl = title;
5301 pfx = NULL;
5302 if (DoTpaPrefix (title, &ttl, &pfx, is_tpa, tpa_exp, tpa_inf, is_tsa)) {
5303 diff = LabelCopy (buf, pfx, buflen);
5304 buflen -= diff;
5305 buf += diff;
5306 }
5307 diff = LabelCopy (buf, ttl, buflen);
5308 buflen -= diff;
5309 buf += diff;
5310 }
5311 }
5312 if (tech == MI_TECH_est) {
5313 if (title) {
5314 if (title && StringStr(title, "mRNA sequence") == NULL) {
5315 diff = LabelCopy(buf, ", mRNA sequence", buflen);
5316 buflen -= diff;
5317 buf += diff;
5318 }
5319 }
5320 } else if (tech == MI_TECH_sts) {
5321 if (title) {
5322 if (title && StringStr(title, "sequence tagged site") == NULL) {
5323 diff = LabelCopy(buf, ", sequence tagged site", buflen);
5324 buflen -= diff;
5325 buf += diff;
5326 }
5327 }
5328 } else if (tech == MI_TECH_survey) {
5329 if (title) {
5330 if (title && StringStr(title, "genomic survey sequence") == NULL) {
5331 diff = LabelCopy(buf, ", genomic survey sequence", buflen);
5332 buflen -= diff;
5333 buf += diff;
5334 }
5335 }
5336 } else if (tech == MI_TECH_wgs) {
5337 if (title) {
5338 vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
5339 if (vnp != NULL) {
5340 biop = (BioSourcePtr) vnp->data.ptrvalue;
5341 if (biop != NULL) {
5342 genome = biop->genome;
5343 if (genome < kNumWGSOrganelles) {
5344 orgnl = organelleForWGS [genome];
5345 }
5346 }
5347 }
5348 if (wgsmaster) {
5349 if (title && StringStr (title, "whole genome shotgun sequencing project") == NULL) {
5350 diff = LabelCopy(buf, " whole genome shotgun sequencing project", buflen);
5351 buflen -= diff;
5352 buf += diff;
5353 }
5354 } else if (title && StringStr (title, "whole genome shotgun sequence") == NULL) {
5355 if (orgnl != NULL && StringStr (title, orgnl) == NULL) {
5356 diff = LabelCopy(buf, " ", buflen);
5357 buflen -= diff;
5358 buf += diff;
5359 diff = LabelCopy(buf, orgnl, buflen);
5360 buflen -= diff;
5361 buf += diff;
5362 }
5363 diff = LabelCopy(buf, ", whole genome shotgun sequence", buflen);
5364 buflen -= diff;
5365 buf += diff;
5366 }
5367 }
5368 }
5369 }
5370 if (iip == NULL && organism != NULL) {
5371 doit = TRUE;
5372 if (title) {
5373 if (StringStr(title, organism) != NULL)
5374 doit = FALSE;
5375 }
5376 if (doit)
5377 LabelCopyExtra(buf, organism, buflen, " [", "]");
5378 }
5379 MemFree(title);
5380 return TRUE;
5381 }
CreateDefLineEx(ItemInfoPtr iip,BioseqPtr bsp,CharPtr buf,Uint4 buflen,Uint1 tech,CharPtr accession,CharPtr organism,Boolean ignoreTitle)5382 NLM_EXTERN Boolean CreateDefLineEx (ItemInfoPtr iip, BioseqPtr bsp, CharPtr buf, Uint4 buflen, Uint1 tech,
5383 CharPtr accession, CharPtr organism, Boolean ignoreTitle)
5384 {
5385 return CreateDefLineExEx (iip, bsp, buf, buflen, tech, accession, organism, ignoreTitle, FALSE);
5386 }
CreateDefLine(ItemInfoPtr iip,BioseqPtr bsp,CharPtr buf,Uint4 buflen,Uint1 tech,CharPtr accession,CharPtr organism)5387 NLM_EXTERN Boolean CreateDefLine (ItemInfoPtr iip, BioseqPtr bsp, CharPtr buf, Uint4 buflen,
5388 Uint1 tech, CharPtr accession, CharPtr organism)
5389 {
5390 return CreateDefLineExEx (iip, bsp, buf, buflen, tech, accession, organism, FALSE, FALSE);
5391 }
5392 /*****************************************************************************
5393 *
5394 * FastaSeqPort(bsp, is_na, do_virtual)
5395 * opens a SeqPort for a fasta output of bsp
5396 *
5397 *****************************************************************************/
FastaSeqPort(BioseqPtr bsp,Boolean is_na,Boolean do_virtual,Uint1 code)5398 NLM_EXTERN SeqPortPtr FastaSeqPort(BioseqPtr bsp, Boolean is_na, Boolean do_virtual,
5399 Uint1 code)
5400 {
5401 SeqPortPtr spp = NULL;
5402 if (bsp == NULL) return spp;
5403 spp = SeqPortNew(bsp, 0, -1, 0, code);
5404 if (do_virtual)
5405 SeqPortSet_do_virtual(spp, TRUE);
5406 SeqPortSeek(spp, 0, SEEK_SET);
5407 return spp;
5408 }
5409 /*****************************************************************************
5410 *
5411 * FastaSeqPortEx(bsp, is_na, do_virtual, slp)
5412 * opens a SeqPort for a fasta output of bsp constrained to slp
5413 *
5414 *****************************************************************************/
FastaSeqPortEx(BioseqPtr bsp,Boolean is_na,Boolean do_virtual,Uint1 code,SeqLocPtr slp)5415 NLM_EXTERN SeqPortPtr FastaSeqPortEx(BioseqPtr bsp, Boolean is_na, Boolean do_virtual,
5416 Uint1 code, SeqLocPtr slp)
5417 {
5418 SeqPortPtr spp = NULL;
5419 if (bsp == NULL) return spp;
5420 if (slp == NULL) return FastaSeqPort (bsp, is_na, do_virtual, code);
5421 spp = SeqPortNew(bsp, SeqLocStart(slp), SeqLocStop(slp),
5422 SeqLocStrand(slp), code);
5423 if (do_virtual)
5424 SeqPortSet_do_virtual(spp, TRUE);
5425 SeqPortSeek(spp, 0, SEEK_SET);
5426 return spp;
5427 }
5428 /*****************************************************************************
5429 *
5430 * FastaSeqLine(spp, buf, linelen)
5431 * an open seqport is passed in.
5432 * fills buf with linelen bases
5433 * assumes buf[linelen] = '\0'
5434 * returns FALSE when no more residues to print
5435 *
5436 *****************************************************************************/
FastaSeqLine(SeqPortPtr spp,CharPtr buf,Int2 linelen,Boolean is_na)5437 NLM_EXTERN Boolean FastaSeqLine(SeqPortPtr spp, CharPtr buf, Int2 linelen, Boolean is_na)
5438 {
5439 return FastaSeqLineEx(spp, buf, linelen, is_na, FALSE);
5440 }
FastaSeqLineEx(SeqPortPtr spp,CharPtr buf,Int2 linelen,Boolean is_na,Boolean do_virtual)5441 NLM_EXTERN Boolean FastaSeqLineEx(SeqPortPtr spp, CharPtr buf, Int2 linelen, Boolean is_na, Boolean
5442 do_virtual)
5443 {
5444 Int2 ctr = 0;
5445 Uint1 residue;
5446 Int4 pos;
5447 Char idbuf[128];
5448 if ((spp == NULL) || (buf == NULL)) return FALSE;
5449 while ((residue = SeqPortGetResidue(spp)) != SEQPORT_EOF)
5450 {
5451 if (! IS_residue(residue))
5452 {
5453 if (residue == INVALID_RESIDUE)
5454 {
5455 if (is_na)
5456 residue = 'N';
5457 else
5458 residue = 'X';
5459 FastaId(spp->bsp, idbuf, 39);
5460 pos = SeqPortTell(spp);
5461 ErrPostEx(SEV_ERROR,0,0, "ToFastA: Invalid residue at position %ld in %s",
5462 (long) pos, idbuf);
5463 }
5464 else
5465 {
5466 if (residue == SEQPORT_VIRT) /* gap */
5467 {
5468 if (ctr) /* got some residues already */
5469 {
5470 buf[ctr] = '\0';
5471 SeqPortSeek(spp, -1, SEEK_CUR); /* back up one */
5472 /* can only seek to a real residue, so go past it */
5473 residue = SeqPortGetResidue(spp);
5474 if (residue == SEQPORT_VIRT)
5475 SeqPortSeek(spp, -1, SEEK_CUR);
5476 return TRUE;
5477 }
5478 else if (! do_virtual) /* first one */
5479 {
5480 buf[ctr] = '-';
5481 buf[ctr + 1] = '\0';
5482 return TRUE;
5483 }
5484 }
5485 residue = '\0';
5486 }
5487 }
5488 if (residue != '\0')
5489 {
5490 buf[ctr] = residue;
5491 ctr++;
5492 if (ctr == linelen)
5493 {
5494 buf[ctr] = '\0';
5495 return TRUE;
5496 }
5497 }
5498 }
5499 buf[ctr] = '\0';
5500 if (ctr)
5501 return TRUE;
5502 else
5503 return FALSE;
5504 }
5505 /*****************************************************************************
5506 *
5507 * NC_Cleanup (entityID, ptr)
5508 * internal function for genome RefSeq processing
5509 *
5510 *****************************************************************************/
RemoveAllTitles(GatherObjectPtr gop)5511 static Boolean RemoveAllTitles (GatherObjectPtr gop)
5512 {
5513 ObjValNodePtr ovp;
5514 SeqDescrPtr sdp;
5515 if (gop == NULL ||
5516 gop->itemtype != OBJ_SEQDESC ||
5517 gop->subtype != Seq_descr_title) return TRUE;
5518 sdp = (SeqDescrPtr) gop->dataptr;
5519 if (sdp == NULL || sdp->extended == 0) return TRUE;
5520 ovp = (ObjValNodePtr) sdp;
5521 ovp->idx.deleteme = TRUE;
5522 return TRUE;
5523 }
AddNcTitles(GatherObjectPtr gop)5524 static Boolean AddNcTitles (GatherObjectPtr gop)
5525 {
5526 BioseqPtr bsp;
5527 Char buf [512];
5528 Boolean is_nc;
5529 /*
5530 MolInfoPtr mip;
5531 SeqDescrPtr sdp;
5532 */
5533 SeqIdPtr sip;
5534 CharPtr str;
5535 TextSeqIdPtr tsip;
5536 if (gop == NULL ||
5537 gop->itemtype != OBJ_BIOSEQ) return TRUE;
5538 bsp = (BioseqPtr) gop->dataptr;
5539 if (bsp == NULL) return TRUE;
5540 is_nc = FALSE;
5541 for (sip = bsp->id; sip != NULL; sip = sip->next) {
5542 if (sip->choice == SEQID_OTHER) {
5543 tsip = (TextSeqIdPtr) sip->data.ptrvalue;
5544 if (tsip != NULL && tsip->accession != NULL) {
5545 if (StringNICmp (tsip->accession, "NC_", 3) == 0) {
5546 is_nc = TRUE;
5547 }
5548 }
5549 }
5550 }
5551 if (! is_nc) return TRUE;
5552 if (NewCreateDefLineBuf (NULL, bsp, buf, sizeof (buf), FALSE, FALSE)) {
5553 if (! StringHasNoText (buf)) {
5554 str = StringSaveNoNull (buf);
5555 if (str != NULL) {
5556 SeqDescrAddPointer (&(bsp->descr), Seq_descr_title, (Pointer) str);
5557 }
5558 }
5559 }
5560 /*
5561 for (sdp = bsp->descr; sdp != NULL; sdp = sdp->next) {
5562 if (sdp->choice == Seq_descr_molinfo) {
5563 mip = (MolInfoPtr) sdp->data.ptrvalue;
5564 if (mip != NULL &&
5565 mip->biomol == MOLECULE_TYPE_GENOMIC &&
5566 mip->completeness == 1) {
5567 mip->completeness = 0;
5568 }
5569 }
5570 }
5571 */
5572 return TRUE;
5573 }
/*
 * ClearKeywordsProc
 *   Descriptor visitor.  Frees the keyword list of a GenBank block
 *   descriptor.  If that leaves the block with no remaining content and
 *   the descriptor is an extended (ObjValNode) node, the descriptor is
 *   flagged with idx.deleteme so DeleteMarkedObjects can remove it.
 *
 *   The emptiness test previously had an empty body, so the descriptor
 *   was flagged for deletion even when it still carried other data
 *   (source, taxonomy, division, ...).  It is now only flagged when
 *   every field checked below is NULL.
 *
 *   sdp      - descriptor being visited; non-GenBank descriptors are ignored
 *   userdata - unused
 */
static void ClearKeywordsProc (SeqDescrPtr sdp, Pointer userdata)
{
  GBBlockPtr     gbp;
  ObjValNodePtr  ovn;

  if (sdp == NULL || sdp->choice != Seq_descr_genbank) return;
  gbp = (GBBlockPtr) sdp->data.ptrvalue;
  if (gbp == NULL) return;

  gbp->keywords = ValNodeFreeData (gbp->keywords);

  /* keep the descriptor while the block still holds any other data */
  if (gbp->extra_accessions != NULL || gbp->source != NULL ||
      gbp->keywords != NULL || gbp->origin != NULL ||
      gbp->date != NULL || gbp->entry_date != NULL ||
      gbp->div != NULL || gbp->taxonomy != NULL) {
    return;
  }

  /* only extended nodes carry the idx.deleteme flag */
  if (sdp->extended == 0) return;
  ovn = (ObjValNodePtr) sdp;
  ovn->idx.deleteme = TRUE;
}
ClearGenBankKeywords(Uint2 entityID,Pointer ptr)5591 NLM_EXTERN void ClearGenBankKeywords (Uint2 entityID, Pointer ptr)
5592 {
5593 SeqEntryPtr sep;
5594 if (entityID == 0) {
5595 entityID = ObjMgrGetEntityIDForPointer (ptr);
5596 }
5597 if (entityID == 0) return;
5598 sep = GetTopSeqEntryForEntityID (entityID);
5599 VisitDescriptorsInSep (sep, NULL, ClearKeywordsProc);
5600 DeleteMarkedObjects (entityID, 0, NULL);
5601 }
5602
/*
 * IsNcCallback
 *   Bioseq visitor.  Sets the Boolean pointed to by userdata to TRUE when
 *   the Bioseq has a SEQID_OTHER id whose accession starts with "NC_"
 *   (case-insensitive).  The flag is never reset to FALSE here, so it
 *   accumulates over all visited Bioseqs.
 *
 *   bsp      - Bioseq being visited
 *   userdata - BoolPtr receiving the result; ignored when NULL
 *
 *   A NULL accession is skipped explicitly, matching the checks made in
 *   AddNcTitles, AddNmTitles and x_SetFlags.
 */
static void IsNcCallback (BioseqPtr bsp, Pointer userdata)

{
  BoolPtr       is_ncP;
  SeqIdPtr      sip;
  TextSeqIdPtr  tsip;

  if (bsp == NULL) return;
  is_ncP = (BoolPtr) userdata;
  if (is_ncP == NULL) return;

  for (sip = bsp->id; sip != NULL; sip = sip->next) {
    if (sip->choice != SEQID_OTHER) continue;
    tsip = (TextSeqIdPtr) sip->data.ptrvalue;
    if (tsip == NULL || tsip->accession == NULL) continue;
    if (StringNICmp (tsip->accession, "NC_", 3) == 0) {
      *is_ncP = TRUE;
    }
  }
}
5623
NC_Cleanup(Uint2 entityID,Pointer ptr)5624 NLM_EXTERN void NC_Cleanup (Uint2 entityID, Pointer ptr)
5625 {
5626 Boolean objMgrFilt [OBJ_MAX];
5627 Boolean is_nc = FALSE;
5628 SeqEntryPtr sep;
5629
5630 if (entityID == 0) {
5631 entityID = ObjMgrGetEntityIDForPointer (ptr);
5632 }
5633 if (entityID == 0) return;
5634
5635 sep = GetTopSeqEntryForEntityID (entityID);
5636 VisitBioseqsInSep (sep, (Pointer) &is_nc, IsNcCallback);
5637 if (! is_nc) return;
5638
5639 AssignIDsInEntity (entityID, 0, NULL);
5640 MemSet ((Pointer) objMgrFilt, FALSE, sizeof (objMgrFilt));
5641 objMgrFilt [OBJ_SEQDESC] = TRUE;
5642 GatherObjectsInEntity (entityID, 0, NULL, RemoveAllTitles, NULL, objMgrFilt);
5643 VisitDescriptorsInSep (sep, NULL, ClearKeywordsProc);
5644 DeleteMarkedObjects (entityID, 0, NULL);
5645 MemSet ((Pointer) objMgrFilt, FALSE, sizeof (objMgrFilt));
5646 objMgrFilt [OBJ_BIOSEQ] = TRUE;
5647 GatherObjectsInEntity (entityID, 0, NULL, AddNcTitles, NULL, objMgrFilt);
5648 }
5649
InstantiateNCTitle(Uint2 entityID,Pointer ptr)5650 NLM_EXTERN void InstantiateNCTitle (Uint2 entityID, Pointer ptr)
5651 {
5652 Boolean objMgrFilt [OBJ_MAX];
5653
5654 if (entityID == 0) {
5655 entityID = ObjMgrGetEntityIDForPointer (ptr);
5656 }
5657 if (entityID == 0) return;
5658
5659 AssignIDsInEntity (entityID, 0, NULL);
5660 MemSet ((Pointer) objMgrFilt, FALSE, sizeof (objMgrFilt));
5661 objMgrFilt [OBJ_BIOSEQ] = TRUE;
5662 GatherObjectsInEntity (entityID, 0, NULL, AddNcTitles, NULL, objMgrFilt);
5663 }
5664
AddNmTitles(GatherObjectPtr gop)5665 static Boolean AddNmTitles (GatherObjectPtr gop)
5666 {
5667 BioseqPtr bsp;
5668 Char buf [512];
5669 Boolean is_nm;
5670 SeqIdPtr sip;
5671 CharPtr str;
5672 TextSeqIdPtr tsip;
5673 if (gop == NULL ||
5674 gop->itemtype != OBJ_BIOSEQ) return TRUE;
5675 bsp = (BioseqPtr) gop->dataptr;
5676 if (bsp == NULL) return TRUE;
5677 is_nm = FALSE;
5678 for (sip = bsp->id; sip != NULL; sip = sip->next) {
5679 if (sip->choice == SEQID_OTHER) {
5680 tsip = (TextSeqIdPtr) sip->data.ptrvalue;
5681 if (tsip != NULL && tsip->accession != NULL) {
5682 if (StringNICmp (tsip->accession, "NM_", 3) == 0) {
5683 is_nm = TRUE;
5684 } else if (StringNICmp (tsip->accession, "XM_", 3) == 0) {
5685 is_nm = TRUE;
5686 }
5687 }
5688 }
5689 }
5690 if (! is_nm) return TRUE;
5691 if (NewCreateDefLineBuf (NULL, bsp, buf, sizeof (buf), FALSE, FALSE)) {
5692 if (! StringHasNoText (buf)) {
5693 str = StringSaveNoNull (buf);
5694 if (str != NULL) {
5695 SeqDescrAddPointer (&(bsp->descr), Seq_descr_title, (Pointer) str);
5696 }
5697 }
5698 }
5699 return TRUE;
5700 }
5701
InstantiateNMTitles(Uint2 entityID,Pointer ptr)5702 NLM_EXTERN void InstantiateNMTitles (Uint2 entityID, Pointer ptr)
5703 {
5704 Boolean objMgrFilt [OBJ_MAX];
5705
5706 if (entityID == 0) {
5707 entityID = ObjMgrGetEntityIDForPointer (ptr);
5708 }
5709 if (entityID == 0) return;
5710
5711 AssignIDsInEntity (entityID, 0, NULL);
5712 MemSet ((Pointer) objMgrFilt, FALSE, sizeof (objMgrFilt));
5713 objMgrFilt [OBJ_BIOSEQ] = TRUE;
5714 GatherObjectsInEntity (entityID, 0, NULL, AddNmTitles, NULL, objMgrFilt);
5715 }
5716
/*
 * ClearProtTitlesProc
 *   Bioseq visitor.  For a protein Bioseq that has no SEQID_OTHER id,
 *   flags every title descriptor held in an extended (ObjValNode) node
 *   with idx.deleteme.  Non-protein Bioseqs and Bioseqs with a
 *   SEQID_OTHER id are left untouched.
 *
 *   userdata - unused
 */
static void ClearProtTitlesProc (BioseqPtr bsp, Pointer userdata)
{
  SeqDescrPtr  descr;
  SeqIdPtr     id;

  if (bsp == NULL || ! ISA_aa (bsp->mol)) return;

  for (id = bsp->id; id != NULL; id = id->next) {
    if (id->choice == SEQID_OTHER) return;
  }

  for (descr = bsp->descr; descr != NULL; descr = descr->next) {
    if (descr->choice != Seq_descr_title) continue;
    if (descr->extended == 0) continue;
    ((ObjValNodePtr) descr)->idx.deleteme = TRUE;
  }
}
/*
 * ClearProtTitlesNPS
 *   Bioseq-set visitor.  For a set of class nuc-prot, runs
 *   ClearProtTitlesProc over every Bioseq in the set; other set classes
 *   are ignored.
 *
 *   bssp     - set being visited; a NULL pointer is ignored, in line with
 *              the NULL checks made by the other visitor callbacks here
 *   userdata - unused
 */
static void ClearProtTitlesNPS (BioseqSetPtr bssp, Pointer userdata)
{
  if (bssp == NULL) return;
  if (bssp->_class != BioseqseqSet_class_nuc_prot) return;
  VisitBioseqsInSet (bssp, NULL, ClearProtTitlesProc);
}
ClearProteinTitlesInNucProts(Uint2 entityID,Pointer ptr)5741 NLM_EXTERN void ClearProteinTitlesInNucProts (Uint2 entityID, Pointer ptr)
5742 {
5743 SeqEntryPtr sep;
5744 if (entityID == 0) {
5745 entityID = ObjMgrGetEntityIDForPointer (ptr);
5746 }
5747 if (entityID == 0) return;
5748 sep = GetTopSeqEntryForEntityID (entityID);
5749 VisitSetsInSep (sep, NULL, ClearProtTitlesNPS);
5750 DeleteMarkedObjects (entityID, 0, NULL);
5751 }
5752
5753
/*
 * AddProtTitles
 *   Bioseq visitor.  Adds a generated title descriptor to a protein
 *   Bioseq that does not already have one.  Nothing is done for Bioseqs
 *   with a PIR, SWISS-PROT, patent, PRF or PDB id.
 *
 *   userdata - unused
 */
static void AddProtTitles (BioseqPtr bsp, Pointer userdata)
{
  Char         defline [512];
  SeqDescrPtr  descr;
  SeqIdPtr     id;
  CharPtr      title;

  if (bsp == NULL || ! ISA_aa (bsp->mol)) return;

  /* no titles are generated for these id types */
  for (id = bsp->id; id != NULL; id = id->next) {
    switch (id->choice) {
      case SEQID_PIR :
      case SEQID_SWISSPROT :
      case SEQID_PATENT :
      case SEQID_PRF :
      case SEQID_PDB :
        return;
      default :
        break;
    }
  }

  /* an existing title is left alone */
  for (descr = bsp->descr; descr != NULL; descr = descr->next) {
    if (descr->choice == Seq_descr_title) return;
  }

  if (! NewCreateDefLineBuf (NULL, bsp, defline, sizeof (defline), FALSE, FALSE)) return;
  if (StringHasNoText (defline)) return;

  title = StringSaveNoNull (defline);
  if (title != NULL) {
    SeqDescrAddPointer (&(bsp->descr), Seq_descr_title, (Pointer) title);
  }
}
InstantiateProteinTitles(Uint2 entityID,Pointer ptr)5781 NLM_EXTERN void InstantiateProteinTitles (Uint2 entityID, Pointer ptr)
5782 {
5783 SeqEntryPtr sep;
5784 if (entityID == 0) {
5785 entityID = ObjMgrGetEntityIDForPointer (ptr);
5786 }
5787 if (entityID == 0) return;
5788 AssignIDsInEntity (entityID, 0, NULL);
5789 sep = GetTopSeqEntryForEntityID (entityID);
5790 VisitBioseqsInSep (sep, NULL, AddProtTitles);
5791 }
5792
5793
UpdateProteinTitle(BioseqPtr bsp)5794 NLM_EXTERN void UpdateProteinTitle (BioseqPtr bsp)
5795 {
5796 Char buf [512];
5797 SeqDescrPtr sdp;
5798 ObjValNodePtr ovp;
5799 SeqIdPtr sip;
5800 CharPtr str;
5801
5802 if (bsp == NULL || !ISA_aa (bsp->mol)) {
5803 return;
5804 }
5805
5806 /* we don't create protein titles for these IDs */
5807 for (sip = bsp->id; sip != NULL; sip = sip->next) {
5808 if (sip->choice == SEQID_PIR ||
5809 sip->choice == SEQID_SWISSPROT ||
5810 sip->choice == SEQID_PATENT ||
5811 sip->choice == SEQID_PRF ||
5812 sip->choice == SEQID_PDB) return;
5813 }
5814
5815 sdp = BioseqGetSeqDescr (bsp, Seq_descr_title, NULL);
5816 if (sdp == NULL) {
5817 /* we only update a title if it already exists */
5818 return;
5819 }
5820 if (sdp->extended) {
5821 ovp = (ObjValNodePtr) sdp;
5822 ovp->idx.deleteme = TRUE;
5823 DeleteMarkedObjects (bsp->idx.entityID, OBJ_BIOSEQ, bsp);
5824 }
5825
5826 if (NewCreateDefLineBuf (NULL, bsp, buf, sizeof (buf), FALSE, FALSE)) {
5827 if (! StringHasNoText (buf)) {
5828 str = StringSaveNoNull (buf);
5829 if (str != NULL) {
5830 SeqDescrAddPointer (&(bsp->descr), Seq_descr_title, (Pointer) str);
5831 }
5832 }
5833 }
5834 }
5835
5836
5837 /* NEW DEFLINE GENERATOR */
5838
/*
 * DefLineData
 *   Working state for the new definition-line generator.  x_SetFlags
 *   fills the Seq-inst, Seq-id, MolInfo, keyword, PDB, user-object,
 *   comment, map and super-kingdom members from the Bioseq in m_bioseq;
 *   the CharPtr members it sets point into the Bioseq's own data rather
 *   than at copies.
 */
typedef struct deflinestruct {
  /* instance variables: the item and Bioseq being described */
  ItemInfoPtr m_iip;
  BioseqPtr m_bioseq;

  /* ignore existing title is forced for certain types */
  /* (x_SetFlags sets m_reconstruct for unfinished HTG techniques) */
  Boolean m_reconstruct;
  Boolean m_allprotnames;

  Boolean m_gpipemode;
  Boolean m_devmode;

  /* seq-inst fields: molecule class, representation and topology */
  Boolean m_is_na;
  Boolean m_is_aa;

  Boolean m_is_seg;
  Boolean m_is_delta;
  Boolean m_is_virtual;
  Boolean m_is_map;
  Uint1 m_topology;

  /* seq-id fields */
  /* m_is_nc/nm/nr/wp come from the accession prefix of a SEQID_OTHER id */
  /* (both "NM_" and "XM_" set m_is_nm) */
  Boolean m_is_nc;
  Boolean m_is_nm;
  Boolean m_is_nr;
  Boolean m_is_patent;
  Boolean m_is_pdb;
  Boolean m_is_wp;
  Boolean m_third_party;
  /* m_wgs_master: accession of a recognized length ending in all zeros */
  Boolean m_wgs_master;
  /* m_tsa_master / m_tls_master: TSA or targeted tech on a virtual Bioseq */
  Boolean m_tsa_master;
  Boolean m_tls_master;

  /* tag string of a non-skippable general id */
  CharPtr m_general_str;
  /* patent id: country, number (or application number) and sequence id */
  CharPtr m_patent_country;
  CharPtr m_patent_number;
  int m_patent_sequence;

  /* PDB chain; only recorded when the chain value is greater than 32 */
  int m_pdb_chain;

  /* molinfo fields */
  Uint1 m_mi_biomol;
  Uint1 m_mi_tech;
  Uint1 m_mi_completeness;

  /* flags derived from the MolInfo technique */
  Boolean m_htg_tech;
  Boolean m_htgs_unfinished;
  Boolean m_is_tls;
  Boolean m_is_tsa;
  Boolean m_is_wgs;
  Boolean m_is_est_sts_gss;

  /* set together with each of the technique flags above */
  Boolean m_use_biosrc;

  /* genbank or embl block keyword fields */
  Boolean m_htgs_cancelled;
  Boolean m_htgs_draft;
  Boolean m_htgs_pooled;
  Boolean m_tpa_exp;
  Boolean m_tpa_inf;
  /* set by either the TPA:reassembly or the TPA:assembly keyword */
  Boolean m_tpa_reasm;
  Boolean m_unordered;

  /* pdb block fields: first compound string of the PDB block */
  CharPtr m_pdb_compound;

  /* biosource fields */
  CharPtr m_taxname;
  Boolean m_multispecies;
  int m_genome;
  Boolean m_is_plasmid;
  Boolean m_is_chromosome;

  CharPtr m_organelle;

  /* super-kingdom names gathered for WP_ records; m_is_cross_kingdom is */
  /* set when two differing names are seen */
  CharPtr m_first_super_kingdom;
  CharPtr m_second_super_kingdom;
  Boolean m_is_cross_kingdom;

  /* subsource fields */
  CharPtr m_chromosome;
  CharPtr m_clone;
  Boolean m_has_clone;
  CharPtr m_map;
  CharPtr m_plasmid;
  CharPtr m_segment;

  /* orgmod fields */
  CharPtr m_breed;
  CharPtr m_cultivar;
  CharPtr m_isolate;
  CharPtr m_strain;

  /* map fields: enzyme string taken from a restriction-site feature */
  CharPtr m_enzyme;

  /* user object fields */
  Boolean m_is_unverified;
  CharPtr m_targeted_locus;

  /* comment fields */
  Boolean m_is_pseudogene;

  /* exception fields: text matcher used by x_CDShasLowQualityException */
  TextFsaPtr m_low_quality_fsa;
} DefLineData, PNTR DefLinePtr;
5946
x_CDShasLowQualityException(DefLinePtr dlp,SeqFeatPtr sfp)5947 static Boolean x_CDShasLowQualityException (
5948 DefLinePtr dlp,
5949 SeqFeatPtr sfp
5950 )
5951
5952 {
5953 Char ch;
5954 TextFsaPtr fsa;
5955 ValNodePtr matches;
5956 CharPtr ptr;
5957 Int4 state;
5958
5959 if (dlp == NULL || sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION) return FALSE;
5960
5961 if (! sfp->excpt) return FALSE;
5962 if (StringHasNoText (sfp->except_text)) return FALSE;
5963
5964 fsa = dlp->m_low_quality_fsa;
5965 if (fsa == NULL) return FALSE;
5966
5967 state = 0;
5968 matches = NULL;
5969 for (ptr = sfp->except_text, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
5970 state = TextFsaNext (fsa, state, ch, &matches);
5971 if (matches != NULL) {
5972 return TRUE;
5973 }
5974 }
5975
5976 return FALSE;
5977 }
5978
x_OrganelleName(DefLinePtr dlp,Boolean has_plasmid,Boolean virus_or_phage,Boolean wgs_suffix)5979 static CharPtr x_OrganelleName (
5980 DefLinePtr dlp,
5981 Boolean has_plasmid,
5982 Boolean virus_or_phage,
5983 Boolean wgs_suffix
5984 )
5985
5986 {
5987 CharPtr result = NULL;
5988
5989 if (dlp == NULL) return NULL;
5990
5991 switch (dlp->m_genome) {
5992 case GENOME_chloroplast :
5993 result = "chloroplast";
5994 break;
5995 case GENOME_chromoplast :
5996 result = "chromoplast";
5997 break;
5998 case GENOME_kinetoplast :
5999 result = "kinetoplast";
6000 break;
6001 case GENOME_mitochondrion :
6002 {
6003 if (has_plasmid || wgs_suffix) {
6004 result = "mitochondrial";
6005 } else {
6006 result = "mitochondrion";
6007 }
6008 break;
6009 }
6010 case GENOME_plastid :
6011 result = "plastid";
6012 break;
6013 case GENOME_macronuclear :
6014 {
6015 result = "macronuclear";
6016 break;
6017 }
6018 case GENOME_extrachrom :
6019 {
6020 if (! wgs_suffix) {
6021 result = "extrachromosomal";
6022 }
6023 break;
6024 }
6025 case GENOME_plasmid :
6026 {
6027 if (! wgs_suffix) {
6028 result = "plasmid";
6029 }
6030 break;
6031 }
6032 /* transposon and insertion-seq are obsolete */
6033 case GENOME_cyanelle :
6034 result = "cyanelle";
6035 break;
6036 case GENOME_proviral :
6037 {
6038 if (! virus_or_phage) {
6039 if (has_plasmid || wgs_suffix) {
6040 result = "proviral";
6041 } else {
6042 result = "provirus";
6043 }
6044 }
6045 break;
6046 }
6047 case GENOME_virion :
6048 {
6049 if (! virus_or_phage) {
6050 result = "virus";
6051 }
6052 break;
6053 }
6054 case GENOME_nucleomorph :
6055 {
6056 if (! wgs_suffix) {
6057 result = "nucleomorph";
6058 }
6059 break;
6060 }
6061 case GENOME_apicoplast :
6062 result = "apicoplast";
6063 break;
6064 case GENOME_leucoplast :
6065 result = "leucoplast";
6066 break;
6067 case GENOME_proplastid :
6068 result = "proplastid";
6069 break;
6070 case GENOME_endogenous_virus :
6071 result = "endogenous virus";
6072 break;
6073 case GENOME_hydrogenosome :
6074 result = "hydrogenosome";
6075 break;
6076 case GENOME_chromosome :
6077 result = "chromosome";
6078 break;
6079 case GENOME_chromatophore :
6080 result = "chromatophore";
6081 break;
6082 }
6083
6084 return result;
6085 }
6086
6087 /* set instance variables from Seq-inst, Seq-ids, MolInfo, etc., but not BioSource */
x_SetFlags(DefLinePtr dlp)6088 static void x_SetFlags (
6089 DefLinePtr dlp
6090 )
6091
6092 {
6093 BioSourcePtr biop;
6094 BioseqPtr bsp;
6095 IdPatPtr cit;
6096 ValNodePtr compound;
6097 DbtagPtr dbt;
6098 EMBLBlockPtr ebp;
6099 GBBlockPtr gbp;
6100 DbtagPtr general;
6101 ValNodePtr keywords;
6102 size_t len;
6103 MolInfoPtr mip;
6104 Int2 num_super_kingdom = 0;
6105 ObjectIdPtr oip;
6106 OrgNamePtr onp;
6107 OrgRefPtr orp;
6108 PdbBlockPtr pbp;
6109 PDBSeqIdPtr pdbip;
6110 PatentSeqIdPtr psip;
6111 RsiteRefPtr rrp;
6112 SeqDescrPtr sdp;
6113 SeqFeatPtr sfp;
6114 SeqIdPtr sip;
6115 CharPtr str;
6116 Boolean super_kingdoms_different = FALSE;
6117 TaxElementPtr tep;
6118 TextSeqIdPtr tsip;
6119 UserFieldPtr ufp;
6120 UserObjectPtr uop;
6121 ValNodePtr vnp;
6122
6123 if (dlp == NULL) return;
6124
6125 bsp = dlp->m_bioseq;
6126 if (bsp == NULL) return;
6127
6128 dlp->m_is_na = (Boolean) ISA_na (bsp->mol);
6129 dlp->m_is_aa = (Boolean) ISA_aa (bsp->mol);
6130 dlp->m_topology = bsp->topology;
6131
6132 dlp->m_is_seg = (Boolean) (bsp->repr == Seq_repr_seg);
6133 dlp->m_is_delta = (Boolean) (bsp->repr == Seq_repr_delta);
6134 dlp->m_is_virtual = (Boolean) (bsp->repr == Seq_repr_virtual);
6135 dlp->m_is_map = (Boolean) (bsp->repr == Seq_repr_map);
6136
6137 /* process Seq-ids */
6138 for (sip = bsp->id; sip != NULL; sip = sip->next) {
6139 switch (sip->choice) {
6140 case SEQID_OTHER :
6141 tsip = (TextSeqIdPtr) sip->data.ptrvalue;
6142 if (tsip != NULL && tsip->accession != NULL) {
6143 if (StringNICmp (tsip->accession, "NC_", 3) == 0) {
6144 dlp->m_is_nc = TRUE;
6145 } else if (StringNICmp (tsip->accession, "NM_", 3) == 0) {
6146 dlp->m_is_nm = TRUE;
6147 } else if (StringNICmp (tsip->accession, "XM_", 3) == 0) {
6148 dlp->m_is_nm = TRUE;
6149 } else if (StringNICmp (tsip->accession, "NR_", 3) == 0) {
6150 dlp->m_is_nr = TRUE;
6151 } else if (StringNICmp (tsip->accession, "WP_", 3) == 0) {
6152 dlp->m_is_wp = TRUE;
6153 }
6154 len = StringLen (tsip->accession);
6155 if (len == 15) {
6156 if (StringCmp (tsip->accession + 9, "000000") == 0) {
6157 dlp->m_wgs_master = TRUE;
6158 }
6159 } else if (len == 16) {
6160 if (StringCmp (tsip->accession + 9, "0000000") == 0) {
6161 dlp->m_wgs_master = TRUE;
6162 }
6163 } else if (len == 17) {
6164 if (StringCmp (tsip->accession + 10, "0000000") == 0) {
6165 dlp->m_wgs_master = TRUE;
6166 }
6167 }
6168 }
6169 break;
6170 case SEQID_GENBANK :
6171 case SEQID_EMBL :
6172 case SEQID_DDBJ :
6173 tsip = (TextSeqIdPtr) sip->data.ptrvalue;
6174 if (tsip != NULL && tsip->accession != NULL) {
6175 len = StringLen (tsip->accession);
6176 if (len == 12) {
6177 if (StringCmp (tsip->accession + 6, "000000") == 0) {
6178 dlp->m_wgs_master = TRUE;
6179 }
6180 } else if (len == 13) {
6181 if (StringCmp (tsip->accession + 6, "0000000") == 0) {
6182 dlp->m_wgs_master = TRUE;
6183 }
6184 } else if (len == 14) {
6185 if (StringCmp (tsip->accession + 6, "00000000") == 0) {
6186 dlp->m_wgs_master = TRUE;
6187 }
6188 }
6189 }
6190 break;
6191 case SEQID_GENERAL :
6192 dbt = (DbtagPtr) sip->data.ptrvalue;
6193 if (dbt != NULL && (! IsSkippableDbtag (dbt))) {
6194 general = dbt;
6195 if (general != NULL) {
6196 oip = general->tag;
6197 if (oip != NULL) {
6198 if (! StringHasNoText (oip->str)) {
6199 dlp->m_general_str = oip->str;
6200 }
6201 }
6202 }
6203 }
6204 break;
6205 case SEQID_TPG :
6206 case SEQID_TPE :
6207 case SEQID_TPD :
6208 dlp->m_third_party = TRUE;
6209 break;
6210 case SEQID_PDB :
6211 dlp->m_is_pdb = TRUE;
6212 pdbip = (PDBSeqIdPtr) sip->data.ptrvalue;
6213 if (pdbip && pdbip->chain > 32) {
6214 dlp->m_pdb_chain = pdbip->chain;
6215 }
6216 break;
6217 case SEQID_PATENT :
6218 dlp->m_is_patent = TRUE;
6219 psip = (PatentSeqIdPtr) sip->data.ptrvalue;
6220 if (psip != NULL) {
6221 dlp->m_patent_sequence = (int) psip->seqid;
6222 cit = psip->cit;
6223 if (cit != NULL) {
6224 dlp->m_patent_country = cit->country;
6225 if (StringDoesHaveText (cit->number)) {
6226 dlp->m_patent_number = cit->number;
6227 } else if (StringDoesHaveText (cit->app_number)) {
6228 dlp->m_patent_number = cit->app_number;
6229 }
6230 }
6231 }
6232 break;
6233 case SEQID_GPIPE :
6234 break;
6235 default :
6236 break;
6237 }
6238 }
6239
6240 /* process MolInfo tech */
6241 sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_molinfo, NULL);
6242 if (sdp != NULL && sdp->choice == Seq_descr_molinfo) {
6243 mip = (MolInfoPtr) sdp->data.ptrvalue;
6244 if (mip != NULL) {
6245 dlp->m_mi_biomol = mip->biomol;
6246 dlp->m_mi_tech = mip->tech;
6247 dlp->m_mi_completeness = mip->completeness;
6248 switch (dlp->m_mi_tech) {
6249 case MI_TECH_htgs_0 :
6250 case MI_TECH_htgs_1 :
6251 case MI_TECH_htgs_2 :
6252 dlp->m_htgs_unfinished = TRUE;
6253 /* manufacture all titles for unfinished HTG sequences */
6254 dlp->m_reconstruct = TRUE;
6255 /* fall through */
6256 case MI_TECH_htgs_3 :
6257 dlp->m_htg_tech = TRUE;
6258 dlp->m_use_biosrc = TRUE;
6259 break;
6260 case MI_TECH_est :
6261 case MI_TECH_sts :
6262 case MI_TECH_survey :
6263 dlp->m_is_est_sts_gss = TRUE;
6264 dlp->m_use_biosrc = TRUE;
6265 break;
6266 case MI_TECH_wgs :
6267 dlp->m_is_wgs = TRUE;
6268 dlp->m_use_biosrc = TRUE;
6269 break;
6270 case MI_TECH_tsa :
6271 dlp->m_is_tsa = TRUE;
6272 dlp->m_use_biosrc = TRUE;
6273 if (dlp->m_is_virtual) {
6274 dlp->m_tsa_master = TRUE;
6275 }
6276 break;
6277 case MI_TECH_targeted :
6278 dlp->m_is_tls = TRUE;
6279 dlp->m_use_biosrc = TRUE;
6280 if (dlp->m_is_virtual) {
6281 dlp->m_tls_master = TRUE;
6282 }
6283 break;
6284 default :
6285 break;
6286 }
6287 }
6288 }
6289
6290 /* process Unverified user object */
6291 for (sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_user, NULL);
6292 sdp != NULL;
6293 sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_user, sdp)) {
6294 if (sdp->choice != Seq_descr_user) continue;
6295 uop = (UserObjectPtr) sdp->data.ptrvalue;
6296 if (uop == NULL) continue;
6297 oip = uop->type;
6298 if (oip == NULL) continue;
6299 if (StringICmp (oip->str, "Unverified") == 0) {
6300 dlp->m_is_unverified = TRUE;
6301 } else if (StringICmp (oip->str, "AutodefOptions") == 0) {
6302 for (ufp = uop->data; ufp != NULL; ufp = ufp->next) {
6303 oip = ufp->label;
6304 if (oip == NULL) continue;
6305 if (StringICmp (oip->str, "Targeted Locus Name") != 0) continue;
6306 if (ufp->choice != 1) continue;
6307 str = (CharPtr) ufp->data.ptrvalue;
6308 if (StringHasNoText (str)) continue;
6309 dlp->m_targeted_locus = str;
6310 }
6311 }
6312 }
6313
6314 /* process comments */
6315 for (sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_comment, NULL);
6316 sdp != NULL;
6317 sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_comment, sdp)) {
6318 if (sdp->choice != Seq_descr_comment) continue;
6319 str = (CharPtr) sdp->data.ptrvalue;
6320 if (str == NULL) continue;
6321 if (StringISearch (str, "[CAUTION] Could be the product of a pseudogene") != 0) {
6322 dlp->m_is_pseudogene = TRUE;
6323 }
6324 }
6325
6326 /* process keywords */
6327 keywords = NULL;
6328
6329 sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_genbank, NULL);
6330 if (sdp != NULL && sdp->choice == Seq_descr_genbank) {
6331 gbp = (GBBlockPtr) sdp->data.ptrvalue;
6332 if (gbp != NULL) {
6333 keywords = gbp->keywords;
6334 }
6335 }
6336 if (keywords == NULL) {
6337 sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_embl, NULL);
6338 if (sdp != NULL && sdp->choice == Seq_descr_embl) {
6339 ebp = (EMBLBlockPtr) sdp->data.ptrvalue;
6340 if (ebp != NULL) {
6341 keywords = ebp->keywords;
6342 }
6343 }
6344 }
6345 if (keywords != NULL) {
6346 for (vnp = keywords; vnp != NULL; vnp = vnp->next) {
6347 str = (CharPtr) vnp->data.ptrvalue;
6348 if (StringHasNoText (str)) continue;
6349 if (StringICmp (str, "UNORDERED") == 0) {
6350 dlp->m_unordered = TRUE;
6351 }
6352 if (! dlp->m_htg_tech && ! dlp->m_third_party) continue;
6353 if (StringICmp (str, "HTGS_DRAFT") == 0) {
6354 dlp->m_htgs_draft = TRUE;
6355 } else if (StringICmp (str, "HTGS_CANCELLED") == 0) {
6356 dlp->m_htgs_cancelled = TRUE;
6357 } else if (StringICmp (str, "HTGS_POOLED_MULTICLONE") == 0) {
6358 dlp->m_htgs_pooled = TRUE;
6359 } else if (StringICmp (str, "TPA:experimental") == 0) {
6360 dlp->m_tpa_exp = TRUE;
6361 } else if (StringICmp (str, "TPA:inferential") == 0) {
6362 dlp->m_tpa_inf = TRUE;
6363 } else if (StringICmp (str, "TPA:reassembly") == 0) {
6364 dlp->m_tpa_reasm = TRUE;
6365 } else if (StringICmp (str, "TPA:assembly") == 0) {
6366 dlp->m_tpa_reasm = TRUE;
6367 }
6368 }
6369 }
6370
6371 if (dlp->m_is_pdb) {
6372
6373 /* process PDB block */
6374 sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_pdb, NULL);
6375 if (sdp != NULL && sdp->choice == Seq_descr_pdb) {
6376 pbp = (PdbBlockPtr) sdp->data.ptrvalue;
6377 if (pbp != NULL) {
6378 compound = pbp->compound;
6379 if (compound != NULL) {
6380 dlp->m_pdb_compound = (CharPtr) compound->data.ptrvalue;
6381 }
6382 }
6383 }
6384 }
6385
6386 if (dlp->m_is_wp) {
6387 for (sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_source, NULL);
6388 sdp != NULL;
6389 sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_source, sdp)) {
6390 if (sdp->choice != Seq_descr_source) continue;
6391 biop = (BioSourcePtr) sdp->data.ptrvalue;
6392 if (biop == NULL) continue;
6393 orp = biop->org;
6394 if (orp == NULL) continue;
6395 onp = orp->orgname;
6396 if (onp == NULL) continue;
6397 if (onp->choice != 5) continue;
6398 for (tep = (TaxElementPtr) onp->data; tep != NULL; tep = tep->next) {
6399 if (tep->fixed_level == 0 && StringICmp (tep->level, "superkingdom") == 0) {
6400 num_super_kingdom++;
6401 if (dlp->m_first_super_kingdom == NULL) {
6402 dlp->m_first_super_kingdom = tep->name;
6403 } else if (StringICmp (dlp->m_first_super_kingdom, tep->name) != 0) {
6404 dlp->m_second_super_kingdom = tep->name;
6405 super_kingdoms_different = TRUE;
6406 }
6407 if (num_super_kingdom > 1 && super_kingdoms_different) {
6408 dlp->m_is_cross_kingdom = TRUE;
6409 }
6410 }
6411 }
6412 }
6413 }
6414
6415 if (dlp->m_is_map) {
6416 for (sfp = (SeqFeatPtr) bsp->seq_ext; sfp != NULL; sfp = sfp->next) {
6417 if (sfp->data.choice != SEQFEAT_RSITE) continue;
6418 rrp = (RsiteRefPtr) sfp->data.value.ptrvalue;
6419 if (rrp == NULL) continue;
6420 if (rrp->choice == 1) {
6421 dlp->m_enzyme = (CharPtr) rrp->data.ptrvalue;
6422 }
6423 }
6424 }
6425 }
6426
6427 /* set instance variables from BioSource */
/*
 * Feature-visit callback: sets m_has_clone on the DefLine record passed in
 * userdata when a BioSource feature carries a non-blank clone SubSource.
 * Features of any other type, and NULL arguments, are ignored.
 */
static void x_SetSrcClone (
  SeqFeatPtr sfp,
  Pointer userdata
)

{
  DefLinePtr    info;
  BioSourcePtr  source;
  SubSourcePtr  sub;

  /* only BioSource features are of interest */
  if (sfp == NULL) return;
  if (sfp->data.choice != SEQFEAT_BIOSRC) return;

  info = (DefLinePtr) userdata;
  if (info == NULL) return;

  source = (BioSourcePtr) sfp->data.value.ptrvalue;
  if (source == NULL) return;

  /* look for clones on source features */
  sub = source->subtype;
  while (sub != NULL) {
    if (sub->subtype == SUBSRC_clone && ! StringHasNoText (sub->name)) {
      info->m_has_clone = TRUE;
    }
    sub = sub->next;
  }
}
6452
/*
 * Fills the BioSource-derived members of the DefLine record from the first
 * source descriptor found on dlp->m_bioseq: taxname, genome and the
 * plasmid/chromosome flags, selected SubSource and OrgMod qualifier values,
 * and the multispecies flag.  Afterwards m_organelle is obtained from
 * x_OrganelleName, and, if no clone has been seen yet, the features on the
 * Bioseq are visited with x_SetSrcClone.
 *
 * The stored strings are pointers into the source data, not copies.
 */
static void x_SetBioSrc (
  DefLinePtr dlp
)

{
  BioSourcePtr   biop = NULL;
  BioseqPtr      bsp;
  Boolean        has_plasmid;
  OrgModPtr      omp;
  OrgNamePtr     onp = NULL;
  OrgRefPtr      orp = NULL;
  SeqDescrPtr    sdp;
  SubSourcePtr   ssp;
  TaxElementPtr  tep;
  Boolean        virus_or_phage;
  Boolean        wgs_suffix;

  if (dlp == NULL) return;

  bsp = dlp->m_bioseq;
  if (bsp == NULL) return;

  /* only the first source descriptor is consulted */
  sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_source, NULL);
  if (sdp != NULL && sdp->choice == Seq_descr_source) {
    biop = (BioSourcePtr) sdp->data.ptrvalue;
  }

  if (biop != NULL) {
    orp = biop->org;
    if (orp != NULL && StringDoesHaveText (orp->taxname)) {
      dlp->m_taxname = orp->taxname;
    }

    dlp->m_genome = biop->genome;
    dlp->m_is_plasmid = (Boolean) (dlp->m_genome == GENOME_plasmid);
    dlp->m_is_chromosome = (Boolean) (dlp->m_genome == GENOME_chromosome);

    /* process SubSource - blank values are skipped, later values overwrite earlier ones */
    ssp = biop->subtype;
    while (ssp != NULL) {
      if (! StringHasNoText (ssp->name)) {
        if (ssp->subtype == SUBSRC_chromosome) {
          dlp->m_chromosome = ssp->name;
        } else if (ssp->subtype == SUBSRC_clone) {
          dlp->m_clone = ssp->name;
          dlp->m_has_clone = TRUE;
        } else if (ssp->subtype == SUBSRC_map) {
          dlp->m_map = ssp->name;
        } else if (ssp->subtype == SUBSRC_plasmid_name) {
          dlp->m_plasmid = ssp->name;
        } else if (ssp->subtype == SUBSRC_segment) {
          dlp->m_segment = ssp->name;
        }
      }
      ssp = ssp->next;
    }

    /* process OrgMod */
    if (orp != NULL) {
      onp = orp->orgname;
    }
    if (onp != NULL) {
      if (onp->choice == 5) {
        /* any fixed level, or any named level other than "species", marks multispecies */
        for (tep = (TaxElementPtr) onp->data; tep != NULL; tep = tep->next) {
          if (tep->fixed_level > 0 ||
              (StringDoesHaveText (tep->level) && StringICmp (tep->level, "species") != 0)) {
            dlp->m_multispecies = TRUE;
          }
        }
      }

      /* for each modifier kind the first non-blank value wins */
      for (omp = onp->mod; omp != NULL; omp = omp->next) {
        if (StringHasNoText (omp->subname)) continue;
        if (omp->subtype == ORGMOD_strain) {
          if (StringHasNoText (dlp->m_strain)) {
            dlp->m_strain = omp->subname;
          }
        } else if (omp->subtype == ORGMOD_cultivar) {
          if (StringHasNoText (dlp->m_cultivar)) {
            dlp->m_cultivar = omp->subname;
          }
        } else if (omp->subtype == ORGMOD_isolate) {
          if (StringHasNoText (dlp->m_isolate)) {
            dlp->m_isolate = omp->subname;
          }
        } else if (omp->subtype == ORGMOD_breed) {
          if (StringHasNoText (dlp->m_breed)) {
            dlp->m_breed = omp->subname;
          }
        }
      }
    }
  }

  /* inputs for the organelle name lookup */
  virus_or_phage = (Boolean) (StringISearch (dlp->m_taxname, "virus") != NULL ||
                              StringISearch (dlp->m_taxname, "phage") != NULL);
  has_plasmid = (Boolean) (StringDoesHaveText (dlp->m_plasmid) ? TRUE : FALSE);
  wgs_suffix = (Boolean) (dlp->m_is_wgs ? TRUE : FALSE);

  dlp->m_organelle = x_OrganelleName (dlp, has_plasmid, virus_or_phage, wgs_suffix);

  /* fall back to source features only when the descriptor had no clone */
  if (! dlp->m_has_clone) {
    VisitFeaturesOnBsp (bsp, (Pointer) dlp, x_SetSrcClone);
  }
}
6577
/*
 * Removes up to count leading characters from str in place by shifting the
 * remainder of the string down to the start of the buffer.  If the string
 * is shorter than count it becomes empty.  NULL and empty strings are
 * returned untouched.  Returns str.
 */
static CharPtr x_TrimFirstNCharacters (
  CharPtr str,
  Int2 count
)

{
  CharPtr  from;
  CharPtr  to;

  if (str == NULL || str [0] == '\0') return str;

  /* step over at most count characters, never past the terminator */
  from = str;
  while (*from != '\0' && count > 0) {
    from++;
    count--;
  }

  /* slide whatever is left to the front */
  to = str;
  while (*from != '\0') {
    *to = *from;
    to++;
    from++;
  }
  *to = '\0';

  return str;
}
6607
/*
 * Truncates str in place at the start of its trailing run of space,
 * semicolon, comma, tilde and period characters.  A string made up only of
 * such characters becomes empty.  The buffer is not written to when there
 * is nothing to trim.  Returns str.
 */
static CharPtr x_TrimPunctuationFromEnd (
  CharPtr str
)

{
  CharPtr  end;
  Boolean  trimmable;

  if (str == NULL || str [0] == '\0') return str;

  /* walk back from the terminator over the trailing run */
  end = str + StringLen (str);
  while (end > str) {
    switch (end [-1]) {
      case ' ' :
      case ';' :
      case ',' :
      case '~' :
      case '.' :
        trimmable = TRUE;
        break;
      default :
        trimmable = FALSE;
        break;
    }
    if (! trimmable) break;
    end--;
  }

  /* only write when something is actually being cut off */
  if (*end != '\0') {
    *end = '\0';
  }

  return str;
}
6638
/*
 * Truncates str in place at the start of its trailing run of space,
 * semicolon, comma and tilde characters.  Unlike x_TrimPunctuationFromEnd,
 * a period is kept and ends the run.  The buffer is not written to when
 * there is nothing to trim.  Returns str.
 */
static CharPtr x_TrimMostPunctFromEnd (
  CharPtr str
)

{
  CharPtr  end;
  Boolean  trimmable;

  if (str == NULL || str [0] == '\0') return str;

  /* walk back from the terminator over the trailing run */
  end = str + StringLen (str);
  while (end > str) {
    switch (end [-1]) {
      case ' ' :
      case ';' :
      case ',' :
      case '~' :
        trimmable = TRUE;
        break;
      default :
        trimmable = FALSE;
        break;
    }
    if (! trimmable) break;
    end--;
  }

  /* only write when something is actually being cut off */
  if (*end != '\0') {
    *end = '\0';
  }

  return str;
}
6669
x_CatenateValNodeStrings(ValNodePtr list)6670 static CharPtr x_CatenateValNodeStrings (
6671 ValNodePtr list
6672 )
6673
6674 {
6675 size_t len;
6676 CharPtr ptr;
6677 CharPtr str;
6678 CharPtr tmp;
6679 ValNodePtr vnp;
6680
6681
6682 ptr = NULL;
6683 if (list != NULL) {
6684 vnp = list;
6685 len = 0;
6686 while (vnp != NULL) {
6687 if (vnp->data.ptrvalue != NULL) {
6688 len += StringLen ((CharPtr) vnp->data.ptrvalue) + 1;
6689 }
6690 vnp = vnp->next;
6691 }
6692 if (len > 0) {
6693 ptr = MemNew (sizeof (Char) * (len + 2));
6694 if (ptr != NULL) {
6695 vnp = list;
6696 tmp = ptr;
6697 while (vnp != NULL) {
6698 str = (CharPtr) vnp->data.ptrvalue;
6699 /* do not use StringDoesHaveText because generalID must be prefixed by space */
6700 if (str != NULL) {
6701 tmp = StringMove (tmp, str);
6702 }
6703 vnp = vnp->next;
6704 }
6705 }
6706 }
6707 }
6708 return ptr;
6709 }
6710
x_DescribeClones(DefLinePtr dlp)6711 static CharPtr x_DescribeClones (
6712 DefLinePtr dlp
6713 )
6714
6715 {
6716 Char buf [128];
6717 Char ch;
6718 Int4 count;
6719 size_t len;
6720 CharPtr result = NULL;
6721 CharPtr str;
6722
6723 if (dlp == NULL) return NULL;
6724
6725 if (dlp->m_htgs_unfinished && dlp->m_htgs_pooled && dlp->m_has_clone) {
6726 result = StringSave (", pooled multiple clones");
6727 return result;
6728 }
6729
6730 str = dlp->m_clone;
6731 if (StringHasNoText (str)) return NULL;
6732
6733 count = 1;
6734 ch = *str;
6735 while (ch != '\0') {
6736 if (ch == ';') {
6737 count++;
6738 }
6739 str++;
6740 ch = *str;
6741 }
6742
6743 if (count > 3) {
6744 sprintf (buf, ", %d clones", (int) count);
6745 result = StringSave (buf);
6746 } else {
6747 len = StringLen (dlp->m_clone) + 20;
6748 result = (CharPtr) MemNew (sizeof (Char) * len);
6749 if (result != NULL) {
6750 StringCat (result, " clone ");
6751 StringCat (result, dlp->m_clone);
6752 }
6753 }
6754
6755 return result;
6756 }
6757
x_EndsWithStrain(DefLinePtr dlp,CharPtr strain)6758 static Boolean x_EndsWithStrain (
6759 DefLinePtr dlp,
6760 CharPtr strain
6761 )
6762
6763 {
6764 Char ch;
6765 size_t len;
6766 CharPtr nxt;
6767 CharPtr ptr;
6768
6769 if (dlp == NULL || strain == NULL) return FALSE;
6770
6771 len = StringLen (strain);
6772 if (len >= StringLen (dlp->m_taxname)) return FALSE;
6773
6774 ptr = StringChr (dlp->m_taxname, ' ');
6775 if (ptr == NULL) return FALSE;
6776 ptr++;
6777 ptr = StringChr (ptr, ' ');
6778 if (ptr == NULL) return FALSE;
6779 ptr++;
6780
6781 ptr = StringISearch (dlp->m_taxname, strain);
6782 if (ptr == NULL) return FALSE;
6783
6784 nxt = StringISearch (ptr + 1, strain);
6785 while (nxt != NULL) {
6786 ptr = nxt;
6787 nxt = StringISearch (ptr + 1, strain);
6788 }
6789
6790 ptr += len;
6791 if (! StringHasNoText (ptr)) {
6792 if (StringCmp (ptr, "'") == 0) {
6793 ptr -= len + 1;
6794 if (*ptr == '\'') return TRUE;
6795 }
6796 return FALSE;
6797 }
6798 ptr -= len + 1;
6799 ch = *ptr;
6800 /*
6801 if (ch == ' ' || ch == '-' || ch == '_' || ch == ':' ||
6802 ch == ';' || ch == '.' || ch == '/') {
6803 return TRUE;
6804 }
6805 */
6806 if (ispunct (ch) || isspace (ch)) {
6807 return TRUE;
6808 }
6809
6810 return FALSE;
6811 }
6812
x_TitleFromBioSrc(DefLinePtr dlp)6813 static CharPtr x_TitleFromBioSrc (
6814 DefLinePtr dlp
6815 )
6816
6817 {
6818 CharPtr result = NULL, cln, stn, ptr;
6819 ValNodePtr strings = NULL;
6820
6821 if (dlp == NULL) return NULL;
6822
6823 ValNodeCopyStr (&strings, 0, dlp->m_taxname);
6824
6825 if (StringDoesHaveText (dlp->m_strain)) {
6826 stn = StringSave (dlp->m_strain);
6827 ptr = StringChr (stn, ';');
6828 if (ptr != NULL) {
6829 *ptr = '\0';
6830 }
6831 if (! x_EndsWithStrain (dlp, stn)) {
6832 ValNodeCopyStr (&strings, 0, " strain ");
6833 ValNodeCopyStr (&strings, 0, stn);
6834 }
6835 MemFree (stn);
6836 }
6837
6838 if (StringDoesHaveText (dlp->m_breed)) {
6839 ValNodeCopyStr (&strings, 0, " breed ");
6840 ValNodeCopyStr (&strings, 0, dlp->m_breed);
6841 }
6842
6843 if (StringDoesHaveText (dlp->m_cultivar)) {
6844 ValNodeCopyStr (&strings, 0, " cultivar ");
6845 ValNodeCopyStr (&strings, 0, dlp->m_cultivar);
6846 }
6847
6848 if (StringDoesHaveText (dlp->m_isolate)) {
6849 /* x_EndsWithStrain just checks for supplied pattern, using here for isolate */
6850 if (! x_EndsWithStrain (dlp, dlp->m_isolate)) {
6851 ValNodeCopyStr (&strings, 0, " isolate ");
6852 ValNodeCopyStr (&strings, 0, dlp->m_isolate);
6853 }
6854 }
6855
6856 if (StringDoesHaveText (dlp->m_chromosome)) {
6857 ValNodeCopyStr (&strings, 0, " chromosome ");
6858 ValNodeCopyStr (&strings, 0, dlp->m_chromosome);
6859 }
6860
6861 cln = x_DescribeClones (dlp);
6862 if (StringDoesHaveText (cln)) {
6863 ValNodeCopyStr (&strings, 0, cln);
6864 }
6865 MemFree (cln);
6866
6867 if (StringDoesHaveText (dlp->m_map)) {
6868 ValNodeCopyStr (&strings, 0, " map ");
6869 ValNodeCopyStr (&strings, 0, dlp->m_map);
6870 }
6871
6872 if (StringDoesHaveText (dlp->m_organelle)) {
6873 if (StringCmp (dlp->m_organelle, "chromosome") == 0) {
6874 /*
6875 if (StringHasNoText (dlp->m_chromosome)) {
6876 ValNodeCopyStr (&strings, 0, " ");
6877 ValNodeCopyStr (&strings, 0, dlp->m_organelle);
6878 }
6879 */
6880 } else if (StringCmp (dlp->m_organelle, "plasmid") == 0) {
6881 if (StringHasNoText (dlp->m_plasmid) && StringHasNoText (dlp->m_chromosome)) {
6882 ValNodeCopyStr (&strings, 0, " ");
6883 ValNodeCopyStr (&strings, 0, dlp->m_organelle);
6884 }
6885 } else {
6886 ValNodeCopyStr (&strings, 0, " ");
6887 ValNodeCopyStr (&strings, 0, dlp->m_organelle);
6888 }
6889 }
6890
6891 if (StringDoesHaveText (dlp->m_plasmid)) {
6892 if (StringStr (dlp->m_plasmid, "plasmid") == NULL) {
6893 ValNodeCopyStr (&strings, 0, " plasmid ");
6894 } else {
6895 ValNodeCopyStr (&strings, 0, " ");
6896 }
6897 ValNodeCopyStr (&strings, 0, dlp->m_plasmid);
6898 }
6899
6900 result = x_CatenateValNodeStrings (strings);
6901 ValNodeFreeData (strings);
6902 if (result == NULL) return NULL;
6903
6904 return result;
6905 }
6906
/*
 * Lowercases, in place, the initial letter of every "Plasmid" and "Element"
 * occurrence in def (matched case-insensitively), except for an occurrence
 * that starts at the very first character of the string.  Blank or NULL
 * input is ignored.
 */
static void x_LowercasePlasmidOrElement (
  CharPtr def
)

{
  CharPtr  hit;

  if (StringHasNoText (def)) return;

  /* the first character of the title is never altered */
  def++;

  for (hit = StringISearch (def, "plasmid");
       hit != NULL;
       hit = StringISearch (hit + 7, "plasmid")) {
    if (*hit == 'P') {
      *hit = 'p';
    }
  }

  for (hit = StringISearch (def, "element");
       hit != NULL;
       hit = StringISearch (hit + 7, "element")) {
    if (*hit == 'E') {
      *hit = 'e';
    }
  }
}
6934
x_TitleFromNC(DefLinePtr dlp)6935 static CharPtr x_TitleFromNC (
6936 DefLinePtr dlp
6937 )
6938
6939 {
6940 CharPtr completeseq = ", complete sequence";
6941 CharPtr completegen = ", complete genome";
6942 CharPtr result = NULL, pls_pfx = "";
6943 ValNodePtr strings = NULL;
6944
6945 if (dlp == NULL) return NULL;
6946
6947 if (dlp->m_mi_biomol != MOLECULE_TYPE_GENOMIC &&
6948 dlp->m_mi_biomol != MOLECULE_TYPE_OTHER_GENETIC_MATERIAL) return NULL;
6949
6950 if (StringHasNoText (dlp->m_taxname)) return NULL;
6951
6952 if (dlp->m_mi_completeness == 2 ||
6953 dlp->m_mi_completeness == 3 ||
6954 dlp->m_mi_completeness == 4 ||
6955 dlp->m_mi_completeness == 5) {
6956 /* remove "complete" component */
6957 completeseq = ", partial sequence";
6958 completegen = ", genome";
6959 }
6960
6961 if (StringDoesHaveText (dlp->m_plasmid)) {
6962 if (StringISearch (dlp->m_plasmid, "plasmid") == NULL &&
6963 StringISearch (dlp->m_plasmid, "element") == NULL) {
6964 pls_pfx = "plasmid ";
6965 }
6966 }
6967
6968 if (StringISearch (dlp->m_taxname, "plasmid") != NULL) {
6969
6970 ValNodeCopyStr (&strings, 0, dlp->m_taxname);
6971 ValNodeCopyStr (&strings, 0, completeseq);
6972
6973 } else if (dlp->m_is_plasmid) {
6974
6975 if (StringDoesHaveText (dlp->m_plasmid)) {
6976 ValNodeCopyStr (&strings, 0, dlp->m_taxname);
6977 ValNodeCopyStr (&strings, 0, " ");
6978 ValNodeCopyStr (&strings, 0, pls_pfx);
6979 ValNodeCopyStr (&strings, 0, dlp->m_plasmid);
6980 ValNodeCopyStr (&strings, 0, completeseq);
6981 } else {
6982 ValNodeCopyStr (&strings, 0, dlp->m_taxname);
6983 ValNodeCopyStr (&strings, 0, " unnamed plasmid");
6984 ValNodeCopyStr (&strings, 0, completeseq);
6985 }
6986
6987 } else if (StringDoesHaveText (dlp->m_plasmid)) {
6988
6989 if (StringDoesHaveText (dlp->m_organelle)) {
6990 ValNodeCopyStr (&strings, 0, dlp->m_taxname);
6991 ValNodeCopyStr (&strings, 0, " ");
6992 ValNodeCopyStr (&strings, 0, dlp->m_organelle);
6993 ValNodeCopyStr (&strings, 0, " ");
6994 ValNodeCopyStr (&strings, 0, pls_pfx);
6995 ValNodeCopyStr (&strings, 0, dlp->m_plasmid);
6996 ValNodeCopyStr (&strings, 0, completeseq);
6997 } else {
6998 ValNodeCopyStr (&strings, 0, dlp->m_taxname);
6999 ValNodeCopyStr (&strings, 0, " ");
7000 ValNodeCopyStr (&strings, 0, pls_pfx);
7001 ValNodeCopyStr (&strings, 0, dlp->m_plasmid);
7002 ValNodeCopyStr (&strings, 0, completeseq);
7003 }
7004
7005 } else if (StringDoesHaveText (dlp->m_organelle)) {
7006
7007 if (StringDoesHaveText (dlp->m_chromosome)) {
7008 ValNodeCopyStr (&strings, 0, dlp->m_taxname);
7009 if (! dlp->m_is_chromosome) {
7010 ValNodeCopyStr (&strings, 0, " ");
7011 ValNodeCopyStr (&strings, 0, dlp->m_organelle);
7012 }
7013 ValNodeCopyStr (&strings, 0, " chromosome ");
7014 ValNodeCopyStr (&strings, 0, dlp->m_chromosome);
7015 ValNodeCopyStr (&strings, 0, completeseq);
7016 } else {
7017 ValNodeCopyStr (&strings, 0, dlp->m_taxname);
7018 switch (dlp->m_genome) {
7019 case GENOME_mitochondrion :
7020 case GENOME_chloroplast :
7021 case GENOME_kinetoplast :
7022 case GENOME_plastid :
7023 case GENOME_apicoplast :
7024 ValNodeCopyStr (&strings, 0, " ");
7025 ValNodeCopyStr (&strings, 0, dlp->m_organelle);
7026 break;
7027 }
7028 ValNodeCopyStr (&strings, 0, completegen);
7029 }
7030
7031 } else if (StringDoesHaveText (dlp->m_segment)) {
7032
7033 if (StringStr (dlp->m_segment, "DNA") == NULL &&
7034 StringStr (dlp->m_segment, "RNA") == NULL &&
7035 StringStr (dlp->m_segment, "segment") == NULL &&
7036 StringStr (dlp->m_segment, "Segment") == NULL) {
7037 ValNodeCopyStr (&strings, 0, dlp->m_taxname);
7038 ValNodeCopyStr (&strings, 0, " segment ");
7039 ValNodeCopyStr (&strings, 0, dlp->m_segment);
7040 ValNodeCopyStr (&strings, 0, completegen);
7041 } else {
7042 ValNodeCopyStr (&strings, 0, dlp->m_taxname);
7043 ValNodeCopyStr (&strings, 0, " ");
7044 ValNodeCopyStr (&strings, 0, dlp->m_segment);
7045 ValNodeCopyStr (&strings, 0, completegen);
7046 }
7047
7048 } else if (StringDoesHaveText (dlp->m_chromosome)) {
7049
7050 ValNodeCopyStr (&strings, 0, dlp->m_taxname);
7051 ValNodeCopyStr (&strings, 0, " chromosome ");
7052 ValNodeCopyStr (&strings, 0, dlp->m_chromosome);
7053 ValNodeCopyStr (&strings, 0, completegen);
7054
7055 } else {
7056
7057 ValNodeCopyStr (&strings, 0, dlp->m_taxname);
7058 ValNodeCopyStr (&strings, 0, completegen);
7059 }
7060
7061 result = x_CatenateValNodeStrings (strings);
7062 ValNodeFreeData (strings);
7063 if (result == NULL) return NULL;
7064
7065 x_LowercasePlasmidOrElement (result);
7066
7067 return result;
7068 }
7069
/* tallies collected by x_FindNMFeats while features are being visited */
typedef struct nmfeatdata {
  SeqFeatPtr gene;   /* most recently visited gene feature */
  SeqFeatPtr cds;    /* most recently visited coding region feature */
  Int2 numgenes;     /* number of gene features visited */
  Int2 numcds;       /* number of coding region features visited */
  Int2 numprots;     /* number of protein features visited */
} NmFeatData, PNTR NmFeatPtr;
7077
/*
 * Feature-visit callback: counts gene, coding region and protein features
 * in the NmFeatData record passed through userdata, remembering the last
 * gene and last coding region seen.  Other feature types are ignored.
 */
static void x_FindNMFeats (
  SeqFeatPtr sfp,
  Pointer userdata
)

{
  NmFeatPtr  tally;

  if (sfp == NULL) return;

  tally = (NmFeatPtr) userdata;
  if (tally == NULL) return;

  if (sfp->data.choice == SEQFEAT_GENE) {
    tally->gene = sfp;
    tally->numgenes++;
  } else if (sfp->data.choice == SEQFEAT_CDREGION) {
    tally->cds = sfp;
    tally->numcds++;
  } else if (sfp->data.choice == SEQFEAT_PROT) {
    tally->numprots++;
  }
}
7106
x_IsFlyCG(CharPtr str)7107 static Boolean x_IsFlyCG (
7108 CharPtr str
7109 )
7110 {
7111 Char ch;
7112
7113 if (StringHasNoText (str)) return FALSE;
7114
7115 ch = *str;
7116 if (ch != 'C') return FALSE;
7117
7118 str++;
7119 ch = *str;
7120 if (ch != 'G') return FALSE;
7121
7122 str++;
7123 ch = *str;
7124 while (IS_DIGIT (ch)) {
7125 str++;
7126 ch = *str;
7127 }
7128 if (ch != '-') return FALSE;
7129
7130 str++;
7131 ch = *str;
7132 if (ch != 'P') return FALSE;
7133
7134 str++;
7135 ch = *str;
7136 if (IS_ALPHA (ch)) {
7137 str++;
7138 ch = *str;
7139 if (ch == '\0' || ch == ' ' || ch == ',' || ch == ';') return TRUE;
7140 }
7141
7142 return FALSE;
7143 }
7144
/*
 * Scans str one whitespace-delimited token at a time and, at the first
 * token accepted by x_IsFlyCG, changes its "-P" to "-R" in place and
 * stops.  The string is left unchanged when no token qualifies.
 */
static void x_FlyCG_PtoR (
  CharPtr str
)

{
  CharPtr  dash;

  while (StringDoesHaveText (str)) {
    /* move to the start of the next token */
    while (IS_WHITESP (*str)) {
      str++;
    }

    if (x_IsFlyCG (str)) {
      dash = StringStr (str, "-P");
      if (dash != NULL) {
        dash [1] = 'R';
        return;
      }
    }

    /* not a match - step over this token */
    while (*str != '\0' && (! IS_WHITESP (*str))) {
      str++;
    }
  }
}
7172
x_TitleFromNM(DefLinePtr dlp)7173 static CharPtr x_TitleFromNM (
7174 DefLinePtr dlp
7175 )
7176
7177 {
7178 Char buf [512], buf2 [600];
7179 CharPtr cds = NULL, gene = NULL, ptr, result = NULL;
7180 Uint2 entityID;
7181 size_t len;
7182 NmFeatData nfd;
7183 SeqEntryPtr sep;
7184
7185 if (dlp == NULL) return NULL;
7186
7187 if (StringHasNoText (dlp->m_taxname)) return NULL;
7188
7189 MemSet ((Pointer) &nfd, 0, sizeof (NmFeatData));
7190
7191 entityID = ObjMgrGetEntityIDForPointer (dlp->m_bioseq);
7192 sep = GetBestTopParentForDataEx (entityID, dlp->m_bioseq, TRUE);
7193
7194 VisitFeaturesInSep (sep, (Pointer) &nfd, x_FindNMFeats);
7195 if (nfd.numgenes != 1 || nfd.numcds != 1 || nfd.numprots < 1) return NULL;
7196
7197 FeatDefLabel (nfd.gene, buf, sizeof (buf) - 1, OM_LABEL_CONTENT);
7198 gene = StringSaveNoNull (buf);
7199
7200 FeatDefLabel (nfd.cds, buf, sizeof (buf) - 1, OM_LABEL_CONTENT);
7201
7202 /* special case Drosophila RefSeq NM titles */
7203 if (StringICmp (dlp->m_taxname, "Drosophila melanogaster") == 0) {
7204 x_FlyCG_PtoR (buf);
7205 }
7206 ptr = StringStr (buf, "isoform ");
7207 if (ptr != NULL) {
7208 *ptr = '\0';
7209 ptr += 8;
7210 StringCpy (buf2, buf);
7211 StringCat (buf2, "transcript variant ");
7212 StringCat (buf2, ptr);
7213 cds = StringSaveNoNull (buf2);
7214 } else {
7215 cds = StringSaveNoNull (buf);
7216 }
7217
7218 len = StringLen (dlp->m_taxname) + StringLen (cds) +
7219 StringLen (gene) + StringLen (" (), mRNA") + 10;
7220
7221 result = (CharPtr) MemNew (sizeof (Char) * len);
7222
7223 if (result != NULL) {
7224 sprintf (result, "%s %s (%s), mRNA", dlp->m_taxname, cds, gene);
7225 }
7226
7227 MemFree (gene);
7228 MemFree (cds);
7229
7230 return result;
7231 }
7232
x_TitleFromNR(DefLinePtr dlp)7233 static CharPtr x_TitleFromNR (
7234 DefLinePtr dlp
7235 )
7236
7237 {
7238 Char buf [512];
7239 Uint2 entityID;
7240 CharPtr gene = NULL, rna = "miscRNA", result = NULL;
7241 size_t len;
7242 NmFeatData nfd;
7243 SeqEntryPtr sep;
7244
7245 if (dlp == NULL) return NULL;
7246
7247 if (StringHasNoText (dlp->m_taxname)) return NULL;
7248
7249 MemSet ((Pointer) &nfd, 0, sizeof (NmFeatData));
7250
7251 entityID = ObjMgrGetEntityIDForPointer (dlp->m_bioseq);
7252 sep = GetBestTopParentForDataEx (entityID, dlp->m_bioseq, TRUE);
7253
7254 VisitFeaturesInSep (sep, (Pointer) &nfd, x_FindNMFeats);
7255 if (nfd.numgenes < 1) return NULL;
7256
7257 FeatDefLabel (nfd.gene, buf, sizeof (buf) - 1, OM_LABEL_CONTENT);
7258 gene = StringSaveNoNull (buf);
7259
7260 switch (dlp->m_mi_biomol) {
7261 case MOLECULE_TYPE_PRE_MRNA :
7262 rna = "precursorRNA";
7263 break;
7264 case MOLECULE_TYPE_MRNA :
7265 rna = "mRNA";
7266 break;
7267 case MOLECULE_TYPE_RRNA :
7268 rna = "rRNA";
7269 break;
7270 case MOLECULE_TYPE_TRNA :
7271 rna = "tRNA";
7272 break;
7273 case MOLECULE_TYPE_SNRNA :
7274 rna = "snRNA";
7275 break;
7276 case MOLECULE_TYPE_SCRNA :
7277 rna = "scRNA";
7278 break;
7279 case MOLECULE_TYPE_CRNA :
7280 rna = "cRNA";
7281 break;
7282 case MOLECULE_TYPE_SNORNA :
7283 rna = "snoRNA";
7284 break;
7285 case MOLECULE_TYPE_TRANSCRIBED_RNA :
7286 rna = "miscRNA";
7287 break;
7288 case MOLECULE_TYPE_NCRNA :
7289 rna = "ncRNA";
7290 break;
7291 case MOLECULE_TYPE_TMRNA :
7292 rna = "tmRNA";
7293 break;
7294 default :
7295 break;
7296 }
7297
7298 len = StringLen (dlp->m_taxname) + StringLen (gene) +
7299 StringLen (", ") + 30;
7300
7301 result = (CharPtr) MemNew (sizeof (Char) * len);
7302 if (result != NULL) {
7303 sprintf (result, "%s %s, %s", dlp->m_taxname, gene, rna);
7304 }
7305
7306 MemFree (gene);
7307
7308 return result;
7309 }
7310
x_TitleFromPatent(DefLinePtr dlp)7311 static CharPtr x_TitleFromPatent (
7312 DefLinePtr dlp
7313 )
7314
7315 {
7316 Char buf [128];
7317
7318 if (dlp == NULL) return NULL;
7319
7320 sprintf (buf, "Sequence %d from Patent %s %s",
7321 (int) dlp->m_patent_sequence,
7322 dlp->m_patent_country,
7323 dlp->m_patent_number);
7324
7325 return StringSave (buf);
7326 }
7327
x_TitleFromPDB(DefLinePtr dlp)7328 static CharPtr x_TitleFromPDB (
7329 DefLinePtr dlp
7330 )
7331
7332 {
7333 Char buf [128];
7334 Char ch;
7335 CharPtr result = NULL;
7336 ValNodePtr strings = NULL;
7337
7338 if (dlp == NULL) return NULL;
7339
7340 ch = dlp->m_pdb_chain;
7341 if (IS_PRINT (ch)) {
7342 sprintf (buf, "Chain %c, ", ch);
7343 ValNodeCopyStr (&strings, 0, buf);
7344 }
7345 ValNodeCopyStr (&strings, 0, dlp->m_pdb_compound);
7346
7347 result = x_CatenateValNodeStrings (strings);
7348 ValNodeFreeData (strings);
7349
7350 return result;
7351 }
7352
x_TitleFromGPipe(DefLinePtr dlp)7353 static CharPtr x_TitleFromGPipe (
7354 DefLinePtr dlp
7355 )
7356
7357 {
7358 CharPtr result = NULL, cln, stn, ptr;
7359 ValNodePtr strings = NULL;
7360
7361 if (dlp == NULL) return NULL;
7362
7363 ValNodeCopyStr (&strings, 0, dlp->m_taxname);
7364
7365 if (StringDoesHaveText (dlp->m_organelle) && StringICmp (dlp->m_organelle, "plasmid") != 0) {
7366 ValNodeCopyStr (&strings, 0, " ");
7367 ValNodeCopyStr (&strings, 0, dlp->m_organelle);
7368 }
7369
7370 if (StringDoesHaveText (dlp->m_strain)) {
7371 stn = StringSave (dlp->m_strain);
7372 ptr = StringChr (stn, ';');
7373 if (ptr != NULL) {
7374 *ptr = '\0';
7375 }
7376 if (! x_EndsWithStrain (dlp, stn)) {
7377 ValNodeCopyStr (&strings, 0, " strain ");
7378 ValNodeCopyStr (&strings, 0, stn);
7379 }
7380 MemFree (stn);
7381 }
7382
7383 if (StringDoesHaveText (dlp->m_chromosome)) {
7384 ValNodeCopyStr (&strings, 0, " chromosome ");
7385 ValNodeCopyStr (&strings, 0, dlp->m_chromosome);
7386 }
7387
7388 cln = x_DescribeClones (dlp);
7389 if (StringDoesHaveText (cln)) {
7390 ValNodeCopyStr (&strings, 0, cln);
7391 }
7392 MemFree (cln);
7393
7394 if (StringDoesHaveText (dlp->m_map)) {
7395 ValNodeCopyStr (&strings, 0, " map ");
7396 ValNodeCopyStr (&strings, 0, dlp->m_map);
7397 }
7398
7399 if (StringDoesHaveText (dlp->m_plasmid)) {
7400 ValNodeCopyStr (&strings, 0, " plasmid ");
7401 ValNodeCopyStr (&strings, 0, dlp->m_plasmid);
7402 }
7403
7404 if (dlp->m_mi_completeness == 1) {
7405 ValNodeCopyStr (&strings, 0, ", complete sequence");
7406 }
7407
7408 result = x_CatenateValNodeStrings (strings);
7409 ValNodeFreeData (strings);
7410 if (result == NULL) return NULL;
7411
7412 return result;
7413 }
7414
/* search state shared between x_GetLongestProteinUnindexed and x_GetLongestProtFeat */
typedef struct udxfeatdata {
  SeqIdPtr bspid;      /* id list of the Bioseq; feature locations must be on one of these */
  Int4 longest;        /* location length of the best protein feature so far */
  Uint1 processed;     /* processed value of the best feature; lower wins a length tie */
  SeqFeatPtr sfp;      /* best protein feature so far, NULL if none */
} UdxFeatData, PNTR UdxFeatPtr;
7421
/*
 * Feature-visit callback: keeps, in the UdxFeatData record passed through
 * userdata, the protein feature with the longest location among those
 * located on the target Bioseq.  On equal length the feature with the
 * lower processed value replaces the current one.
 */
static void x_GetLongestProtFeat (
  SeqFeatPtr sfp,
  Pointer userdata
)

{
  UdxFeatPtr  best;
  Int4        len;
  ProtRefPtr  prot;
  SeqIdPtr    where;

  if (sfp == NULL) return;
  if (sfp->data.choice != SEQFEAT_PROT) return;

  prot = (ProtRefPtr) sfp->data.value.ptrvalue;
  if (prot == NULL) return;

  best = (UdxFeatPtr) userdata;
  if (best == NULL) return;

  /* the feature must be located on the Bioseq being examined */
  where = SeqLocId (sfp->location);
  if (where == NULL) return;
  if (! SeqIdIn (where, best->bspid)) return;

  len = SeqLocLen (sfp->location);
  if (len == -1) return;

  /* unprocessed 0 preferred over preprotein 1 preferred over mat peptide 2 */
  if (len > best->longest ||
      (len == best->longest && prot->processed < best->processed)) {
    best->sfp = sfp;
    best->longest = len;
    best->processed = prot->processed;
  }
}
7460
x_GetLongestProteinUnindexed(BioseqPtr bsp)7461 static SeqFeatPtr x_GetLongestProteinUnindexed (
7462 BioseqPtr bsp
7463 )
7464
7465 {
7466 BioseqSetPtr bssp = NULL;
7467 UdxFeatData ufd;
7468
7469 if (bsp == NULL) return NULL;
7470
7471 MemSet ((Pointer) &ufd, 0, sizeof (UdxFeatData));
7472 ufd.bspid = bsp->id;
7473 ufd.longest = 0;
7474 ufd.sfp = NULL;
7475
7476 VisitFeaturesOnBsp (bsp, (Pointer) &ufd, x_GetLongestProtFeat);
7477
7478 if (ufd.sfp != NULL && ufd.longest == bsp->length) return ufd.sfp;
7479
7480 if (bsp->idx.parenttype == OBJ_BIOSEQSET) {
7481 bssp = (BioseqSetPtr) bsp->idx.parentptr;
7482 }
7483
7484 if (bssp != NULL && bssp->_class == BioseqseqSet_class_parts) {
7485 VisitFeaturesOnSet (bssp, (Pointer) &ufd, x_GetLongestProtFeat);
7486
7487 if (bssp->idx.parenttype == OBJ_BIOSEQSET) {
7488 bssp = (BioseqSetPtr) bssp->idx.parentptr;
7489 }
7490 }
7491
7492 if (bssp != NULL && bssp->_class == BioseqseqSet_class_segset) {
7493 VisitFeaturesOnSet (bssp, (Pointer) &ufd, x_GetLongestProtFeat);
7494 }
7495
7496 return ufd.sfp;
7497 }
7498
x_NotSpecialTaxName(CharPtr taxname)7499 static Boolean x_NotSpecialTaxName (
7500 CharPtr taxname
7501 )
7502
7503 {
7504 if (StringHasNoText (taxname)) return TRUE;
7505
7506 if (StringICmp (taxname, "synthetic construct") == 0) return FALSE;
7507 if (StringICmp (taxname, "artificial sequence") == 0) return FALSE;
7508 if (StringStr (taxname, "vector") != NULL) return FALSE;
7509 if (StringStr (taxname, "Vector") != NULL) return FALSE;
7510
7511 return TRUE;
7512 }
7513
7514 /*
7515 static CharPtr proteinOrganellePrefix [] = {
7516 NULL,
7517 NULL,
7518 "chloroplast",
7519 "chromoplast",
7520 "kinetoplast",
7521 "mitochondrion",
7522 "plastid",
7523 "macronuclear",
7524 "extrachromosomal",
7525 "plasmid",
7526 NULL,
7527 NULL,
7528 "cyanelle",
7529 "proviral",
7530 "virus",
7531 "nucleomorph",
7532 "apicoplast",
7533 "leucoplast",
7534 "protoplast",
7535 "endogenous virus",
7536 "hydrogenosome",
7537 "chromosome",
7538 "chromatophore"
7539 };
7540 */
7541
/*
 * Organelle words for protein titles, 23 slots (0 through 22); a NULL slot
 * means there is no word for that position.  Compared with the
 * commented-out table above, slots 8, 13, 14 and 21 (extrachromosomal,
 * proviral, virus, chromosome) are NULL here.
 * NOTE(review): presumably indexed by the BioSource genome value - confirm
 * against the code that reads this table.
 */
static CharPtr proteinOrganellePrefix [] = {
  NULL,                 /*  0 */
  NULL,                 /*  1 */
  "chloroplast",        /*  2 */
  "chromoplast",        /*  3 */
  "kinetoplast",        /*  4 */
  "mitochondrion",      /*  5 */
  "plastid",            /*  6 */
  "macronuclear",       /*  7 */
  NULL,                 /*  8 */
  "plasmid",            /*  9 */
  NULL,                 /* 10 */
  NULL,                 /* 11 */
  "cyanelle",           /* 12 */
  NULL,                 /* 13 */
  NULL,                 /* 14 */
  "nucleomorph",        /* 15 */
  "apicoplast",         /* 16 */
  "leucoplast",         /* 17 */
  "protoplast",         /* 18 */
  "endogenous virus",   /* 19 */
  "hydrogenosome",      /* 20 */
  NULL,                 /* 21 */
  "chromatophore"       /* 22 */
};
7567
x_TitleFromProtein(DefLinePtr dlp)7568 static CharPtr x_TitleFromProtein (
7569 DefLinePtr dlp
7570 )
7571
7572 {
7573 BioSourcePtr biop;
7574 BioseqPtr bsp;
7575 SeqFeatPtr cds = NULL;
7576 Char ch;
7577 CharPtr comma = NULL;
7578 Uint2 entityID;
7579 SeqMgrFeatContext fcontext;
7580 GeneRefPtr grp;
7581 Boolean indexed;
7582 CharPtr isoform = NULL;
7583 size_t len;
7584 CharPtr low_qual = "LOW QUALITY PROTEIN: ";
7585 Int2 offset = 0;
7586 CharPtr organelle = NULL;
7587 OrgRefPtr orp;
7588 Boolean partial = FALSE;
7589 CharPtr prefix = "";
7590 ProtRefPtr prp;
7591 CharPtr ptr;
7592 CharPtr result = NULL;
7593 SeqFeatPtr sfp = NULL;
7594 SeqIntPtr sintp;
7595 SeqLocPtr slp, slpx;
7596 SeqPntPtr spp;
7597 CharPtr str;
7598 ValNodePtr strings = NULL;
7599 CharPtr taxname = NULL;
7600 CharPtr title = NULL;
7601 CharPtr tmp;
7602 ValNodePtr vnp;
7603
7604 if (dlp == NULL) return NULL;
7605
7606 bsp = dlp->m_bioseq;
7607 if (bsp == NULL) return NULL;
7608
7609 entityID = ObjMgrGetEntityIDForPointer (bsp);
7610 indexed = (Boolean) (SeqMgrFeaturesAreIndexed (entityID) != 0);
7611
7612 if (indexed) {
7613 sfp = SeqMgrGetBestProteinFeature (bsp, NULL);
7614 } else {
7615 if (dlp->m_is_seg) {
7616 SeqMgrIndexFeatures (entityID, NULL);
7617 indexed = TRUE;
7618 sfp = SeqMgrGetBestProteinFeature (bsp, NULL);
7619 } else {
7620 sfp = x_GetLongestProteinUnindexed (bsp);
7621 }
7622 }
7623
7624 if (dlp->m_mi_completeness > 1 && dlp->m_mi_completeness < 6) {
7625 partial = TRUE;
7626 }
7627
7628 if (sfp != NULL) {
7629 prp = (ProtRefPtr) sfp->data.value.ptrvalue;
7630 if (prp != NULL) {
7631 if (prp->name != NULL) {
7632 if (dlp->m_allprotnames) {
7633 for (vnp = prp->name; vnp != NULL; vnp = vnp->next) {
7634 str = (CharPtr) vnp->data.ptrvalue;
7635 ValNodeCopyStr (&strings, 0, prefix);
7636 ValNodeCopyStr (&strings, 0, str);
7637 prefix = "; ";
7638 }
7639 title = x_CatenateValNodeStrings (strings);
7640 strings = ValNodeFreeData (strings);
7641 } else {
7642 vnp = prp->name;
7643 /* although vnp should not be NULL, a compiler/optimizer bug might let it, so check again */
7644 if (vnp != NULL && vnp->data.ptrvalue != NULL) {
7645 str = (CharPtr) vnp->data.ptrvalue;
7646 title = StringSave (str);
7647 }
7648 }
7649 x_TrimPunctuationFromEnd (title);
7650
7651 /* if hypothetical protein, append locus_tag */
7652 offset = 0;
7653 if (StringNICmp (title, "hypothetical protein", 20) == 0) {
7654 offset = 20;
7655 } else if (StringNICmp (title, "uncharacterized protein", 23) == 0) {
7656 offset = 23;
7657 }
7658 if (offset > 0) {
7659 ptr = title + offset;
7660 if (ptr [0] == ',' && ptr [1] == ' ') {
7661 comma = ",";
7662 ptr += 2;
7663 }
7664 if (ptr [0] == ' ') {
7665 ptr++;
7666 }
7667 if (StringNCmp (ptr, "isoform ", 8) == 0) {
7668 ptr += 8;
7669 isoform = ptr;
7670 ch = *ptr;
7671 while (ch != '\0' && IS_ALPHANUM (ch)) {
7672 ptr++;
7673 ch = *ptr;
7674 }
7675 if (ch != '\0') {
7676 isoform = NULL;
7677 } else {
7678 title [offset] = '\0';
7679 }
7680 }
7681 }
7682 if (StringICmp (title, "hypothetical protein") == 0 || StringICmp (title, "uncharacterized protein") == 0) {
7683 if (! indexed) {
7684 SeqMgrIndexFeatures (entityID, NULL);
7685 indexed = TRUE;
7686 }
7687 if (cds == NULL) {
7688 cds = SeqMgrGetCDSgivenProduct (bsp, NULL);
7689 }
7690 if (cds != NULL) {
7691 grp = SeqMgrGetGeneXref (cds);
7692 if (grp == NULL) {
7693 sfp = SeqMgrGetOverlappingFeature (cds->location, FEATDEF_GENE, NULL, 0, NULL, LOCATION_SUBSET, NULL);
7694 if (sfp != NULL) {
7695 grp = (GeneRefPtr) sfp->data.value.ptrvalue;
7696 }
7697 }
7698 if (grp != NULL) {
7699 if (grp->locus_tag != NULL) {
7700 len = StringLen (title) + StringLen (grp->locus_tag) + StringLen (isoform) + 35;
7701 str = (CharPtr) MemNew (sizeof (Char) * len);
7702 if (str != NULL) {
7703 StringCat (str, title);
7704 StringCat (str, " ");
7705 StringCat (str, grp->locus_tag);
7706 if (StringDoesHaveText (isoform)) {
7707 if (comma != NULL) {
7708 StringCat (str, comma);
7709 }
7710 StringCat (str, " isoform ");
7711 StringCat (str, isoform);
7712 }
7713 MemFree (title);
7714 title = str;
7715 }
7716 }
7717 }
7718 }
7719 }
7720 }
7721
7722 if ( title == NULL && prp->desc != NULL) {
7723 title = StringSave (prp->desc);
7724 }
7725
7726 if ( title == NULL && prp->activity != NULL) {
7727 vnp = prp->activity;
7728 str = (CharPtr) vnp->data.ptrvalue;
7729 title = StringSave (str);
7730 }
7731 }
7732 }
7733
7734 if (title == NULL) {
7735 if (! indexed) {
7736 SeqMgrIndexFeatures (entityID, NULL);
7737 indexed = TRUE;
7738 }
7739 if (cds == NULL) {
7740 cds = SeqMgrGetCDSgivenProduct (bsp, NULL);
7741 }
7742 if (cds != NULL) {
7743 grp = SeqMgrGetGeneXref (cds);
7744 if (grp == NULL) {
7745 sfp = SeqMgrGetOverlappingFeature (cds->location, FEATDEF_GENE, NULL, 0, NULL, LOCATION_SUBSET, NULL);
7746 if (sfp != NULL) {
7747 grp = (GeneRefPtr) sfp->data.value.ptrvalue;
7748 }
7749 }
7750 if (grp != NULL) {
7751 str = NULL;
7752 if (grp->locus != NULL) {
7753 str = grp->locus;
7754 } else if (grp->syn != NULL) {
7755 vnp = grp->syn;
7756 str = (CharPtr) vnp->data.ptrvalue;
7757 } else if (grp->desc != NULL) {
7758 str = grp->desc;
7759 }
7760 if (StringDoesHaveText (str)) {
7761 ValNodeCopyStr (&strings, 0, str);
7762 ValNodeCopyStr (&strings, 0, " gene product");
7763 title = x_CatenateValNodeStrings (strings);
7764 strings = ValNodeFreeData (strings);
7765 }
7766 }
7767 }
7768 }
7769
7770 if (title == NULL) {
7771 title = StringSave ("unnamed protein product");
7772 if (! indexed) {
7773 SeqMgrIndexFeatures (entityID, NULL);
7774 indexed = TRUE;
7775 }
7776 if (cds == NULL) {
7777 cds = SeqMgrGetCDSgivenProduct (bsp, NULL);
7778 }
7779 if (cds != NULL) {
7780 grp = SeqMgrGetGeneXref (cds);
7781 if (grp == NULL) {
7782 sfp = SeqMgrGetOverlappingFeature (cds->location, FEATDEF_GENE, NULL, 0, NULL, LOCATION_SUBSET, NULL);
7783 if (sfp != NULL) {
7784 grp = (GeneRefPtr) sfp->data.value.ptrvalue;
7785 }
7786 }
7787 if (grp != NULL) {
7788 if (grp->locus_tag != NULL) {
7789 len = StringLen (title) + StringLen (grp->locus_tag) + 20;
7790 str = (CharPtr) MemNew (sizeof (Char) * len);
7791 if (str != NULL) {
7792 StringCat (str, title);
7793 StringCat (str, " ");
7794 StringCat (str, grp->locus_tag);
7795 MemFree (title);
7796 title = str;
7797 }
7798 }
7799 }
7800 }
7801 }
7802
7803 if (title != NULL) {
7804 x_TrimPunctuationFromEnd (title);
7805 }
7806
7807 taxname = dlp->m_taxname;
7808 if (StringHasNoText (taxname) || x_NotSpecialTaxName (taxname)) {
7809 if (! indexed) {
7810 SeqMgrIndexFeatures (entityID, NULL);
7811 indexed = TRUE;
7812 }
7813 if (cds == NULL) {
7814 cds = SeqMgrGetCDSgivenProduct (bsp, NULL);
7815 }
7816 if (cds != NULL) {
7817 slp = AsnIoMemCopy ((Pointer) cds->location, (AsnReadFunc) SeqLocAsnRead, (AsnWriteFunc) SeqLocAsnWrite);
7818 if (slp != NULL) {
7819 for (slpx = SeqLocFindNext (slp, NULL); slpx != NULL; slpx = SeqLocFindNext (slp, slpx)) {
7820 if (slpx->choice == SEQLOC_INT) {
7821 sintp = (SeqIntPtr) slpx->data.ptrvalue;
7822 if (sintp != NULL) {
7823 sintp->strand = Seq_strand_both;
7824 }
7825 } else if (slpx->choice == SEQLOC_PNT) {
7826 spp = (SeqPntPtr) slpx->data.ptrvalue;
7827 if (spp != NULL) {
7828 spp->strand = Seq_strand_both;
7829 }
7830 }
7831 }
7832 /*
7833 sfp = SeqMgrGetOverlappingSource (slp, &fcontext);
7834 */
7835 sfp = SeqMgrGetOverlappingFeature (slp, FEATDEF_BIOSRC, NULL, 0, NULL, LOCATION_SUBSET, &fcontext);
7836 if (sfp != NULL) {
7837 biop = (BioSourcePtr) sfp->data.value.ptrvalue;
7838 if (biop != NULL) {
7839 orp = biop->org;
7840 if (orp != NULL) {
7841 taxname = orp->taxname;
7842 }
7843 }
7844 }
7845 SeqLocFree (slp);
7846 }
7847 }
7848 }
7849
7850 if (dlp->m_genome >= GENOME_chloroplast && dlp->m_genome <= GENOME_chromatophore) {
7851 organelle = proteinOrganellePrefix [dlp->m_genome];
7852 /*
7853 if (StringNICmp (organelle, taxname, StringLen (organelle)) == 0) {
7854 organelle = NULL;
7855 }
7856 */
7857 }
7858
7859 if (cds == NULL) {
7860 if (! indexed) {
7861 SeqMgrIndexFeatures (entityID, NULL);
7862 indexed = TRUE;
7863 }
7864 cds = SeqMgrGetCDSgivenProduct (bsp, NULL);
7865 }
7866 if (cds != NULL) {
7867 if (x_CDShasLowQualityException (dlp, cds)) {
7868 if (StringStr (title, low_qual) == NULL) {
7869 len = StringLen (title) + StringLen (low_qual) + 6;
7870 tmp = (CharPtr) MemNew (sizeof (Char) * len);
7871 if (tmp != NULL) {
7872 StringCat (tmp, low_qual);
7873 StringCat (tmp, title);
7874 MemFree (title);
7875 title = tmp;
7876 }
7877 }
7878 }
7879 }
7880
7881 if (partial) {
7882 len = StringLen (title) + 12;
7883 tmp = (CharPtr) MemNew (sizeof (Char) * len);
7884 if (tmp != NULL) {
7885 StringCat (tmp, title);
7886 StringCat (tmp, ", partial");
7887 MemFree (title);
7888 title = tmp;
7889 }
7890 }
7891 if (StringDoesHaveText (organelle)) {
7892 len = StringLen (title) + StringLen (organelle) + 6;
7893 tmp = (CharPtr) MemNew (sizeof (Char) * len);
7894 if (tmp != NULL) {
7895 StringCat (tmp, title);
7896 StringCat (tmp, " (");
7897 StringCat (tmp, organelle);
7898 StringCat (tmp, ")");
7899 MemFree (title);
7900 title = tmp;
7901 }
7902 }
7903
7904 if (dlp->m_is_cross_kingdom && StringDoesHaveText (dlp->m_first_super_kingdom) && StringDoesHaveText (dlp->m_second_super_kingdom)) {
7905 len = StringLen (title) + StringLen (dlp->m_first_super_kingdom) + StringLen (dlp->m_second_super_kingdom) + 8;
7906 tmp = (CharPtr) MemNew (sizeof (Char) * len);
7907 if (tmp != NULL) {
7908 StringCat (tmp, title);
7909 StringCat (tmp, " [");
7910 StringCat (tmp, dlp->m_first_super_kingdom);
7911 StringCat (tmp, "][");
7912 StringCat (tmp, dlp->m_second_super_kingdom);
7913 StringCat (tmp, "]");
7914 MemFree (title);
7915 title = tmp;
7916 }
7917 } else if (StringDoesHaveText (taxname)) {
7918 len = StringLen (title) + StringLen (taxname) + 6;
7919 tmp = (CharPtr) MemNew (sizeof (Char) * len);
7920 if (tmp != NULL) {
7921 StringCat (tmp, title);
7922 StringCat (tmp, " [");
7923 StringCat (tmp, taxname);
7924 StringCat (tmp, "]");
7925 MemFree (title);
7926 title = tmp;
7927 }
7928 }
7929
7930 if (result == NULL) {
7931 result = StringSave (title);
7932 }
7933
7934 MemFree (title);
7935
7936 return result;
7937 }
7938
x_TitleFromSegSeq(DefLinePtr dlp)7939 static CharPtr x_TitleFromSegSeq (
7940 DefLinePtr dlp
7941 )
7942
7943 {
7944 BioseqPtr bsp;
7945 SeqMgrFeatContext ccontext;
7946 SeqFeatPtr cds;
7947 CharPtr cln = NULL;
7948 CharPtr complete = "gene, complete cds";
7949 Uint2 entityID;
7950 SeqMgrFeatContext gcontext;
7951 SeqFeatPtr gene;
7952 GeneRefPtr grp;
7953 CharPtr label = NULL;
7954 size_t len;
7955 CharPtr locus = NULL;
7956 CharPtr modifier = NULL;
7957 CharPtr product = NULL;
7958 CharPtr result = NULL;
7959 CharPtr str;
7960 CharPtr taxname = NULL;
7961 ValNodePtr vnp;
7962
7963 if (dlp == NULL) return NULL;
7964
7965 bsp = dlp->m_bioseq;
7966 if (bsp == NULL) return NULL;
7967
7968 entityID = ObjMgrGetEntityIDForPointer (bsp);
7969 if (SeqMgrFeaturesAreIndexed (entityID) == 0) {
7970 SeqMgrIndexFeatures (entityID, NULL);
7971 }
7972
7973 cds = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &ccontext);
7974
7975 if (cds != NULL) {
7976 if (cds->partial) {
7977 complete = "gene, partial cds";
7978 }
7979 product = ccontext.label;
7980 grp = SeqMgrGetGeneXref (cds);
7981 if (grp != NULL) {
7982 if (StringDoesHaveText (grp->locus)) {
7983 locus = grp->locus;
7984 } else {
7985 vnp = grp->syn;
7986 if (vnp != NULL) {
7987 str = (CharPtr) vnp->data.ptrvalue;
7988 if (StringDoesHaveText (str)) {
7989 locus = str;
7990 }
7991 }
7992 }
7993 }
7994 if (locus == NULL) {
7995 gene = SeqMgrGetOverlappingGene (cds->location, &gcontext);
7996 if (gene != NULL) {
7997 locus = gcontext.label;
7998 }
7999 }
8000 } else {
8001 if (StringDoesHaveText (dlp->m_strain) && (! x_EndsWithStrain (dlp, dlp->m_strain))) {
8002 modifier = dlp->m_strain;
8003 label = " strain ";
8004 } else if (StringDoesHaveText (dlp->m_clone)) {
8005 cln = x_DescribeClones (dlp);
8006 modifier = cln;
8007 } else if (StringDoesHaveText (dlp->m_isolate)) {
8008 modifier = dlp->m_isolate;
8009 label = " isolate ";
8010 }
8011 }
8012
8013 taxname = dlp->m_taxname;
8014 if (StringHasNoText (taxname)) {
8015 taxname = "Unknown";
8016 }
8017
8018 len = StringLen (taxname) + StringLen (label) + StringLen (modifier) +
8019 StringLen (product) + StringLen (locus) + StringLen (complete) + 10;
8020
8021 result = (CharPtr) MemNew (sizeof (Char) * len);
8022 if (result == NULL) {
8023 MemFree (cln);
8024 return NULL;
8025 }
8026
8027 if (taxname != NULL) {
8028 StringCat (result, taxname);
8029 }
8030
8031 if (modifier != NULL) {
8032 if (label != NULL) {
8033 StringCat (result, label);
8034 }
8035 StringCat (result, modifier);
8036 }
8037
8038 if (product != NULL) {
8039 StringCat (result, " ");
8040 StringCat (result, product);
8041 }
8042 if (locus != NULL) {
8043 StringCat (result, " (");
8044 StringCat (result, locus);
8045 StringCat (result, ")");
8046 }
8047 if (product != NULL || locus != NULL) {
8048 StringCat (result, " ");
8049 StringCat (result, complete);
8050 }
8051 TrimSpacesAroundString (result);
8052
8053 MemFree (cln);
8054
8055 return result;
8056 }
8057
x_StringInList(ValNodePtr strings,CharPtr str)8058 static Boolean x_StringInList (
8059 ValNodePtr strings,
8060 CharPtr str
8061 )
8062
8063 {
8064 CharPtr tmp;
8065 ValNodePtr vnp;
8066
8067 if (strings == NULL || StringHasNoText (str)) return FALSE;
8068
8069 for (vnp = strings; vnp != NULL; vnp = vnp->next) {
8070 tmp = (CharPtr) vnp->data.ptrvalue;
8071 if (StringStr (tmp, str) != NULL) return TRUE;
8072 }
8073
8074 return FALSE;
8075 }
8076
8077
x_TitleFromWGS(DefLinePtr dlp)8078 static CharPtr x_TitleFromWGS (
8079 DefLinePtr dlp
8080 )
8081
8082 {
8083 CharPtr result = NULL, cln, mod, ptr;
8084 ValNodePtr strings = NULL;
8085
8086 if (dlp == NULL) return NULL;
8087
8088 ValNodeCopyStr (&strings, 0, dlp->m_taxname);
8089
8090 if (StringDoesHaveText (dlp->m_strain)) {
8091 mod = StringSave (dlp->m_strain);
8092 ptr = StringChr (mod, ';');
8093 if (ptr != NULL) {
8094 *ptr = '\0';
8095 }
8096 if (! x_EndsWithStrain (dlp, mod)) {
8097 ValNodeCopyStr (&strings, 0, " strain ");
8098 ValNodeCopyStr (&strings, 0, mod);
8099 }
8100 MemFree (mod);
8101 } else if (StringDoesHaveText (dlp->m_breed)) {
8102 ValNodeCopyStr (&strings, 0, " breed ");
8103 mod = StringSave (dlp->m_breed);
8104 ptr = StringChr (mod, ';');
8105 if (ptr != NULL) {
8106 *ptr = '\0';
8107 }
8108 ValNodeCopyStr (&strings, 0, mod);
8109 MemFree (mod);
8110 } else if (StringDoesHaveText (dlp->m_cultivar)) {
8111 ValNodeCopyStr (&strings, 0, " cultivar ");
8112 mod = StringSave (dlp->m_cultivar);
8113 ptr = StringChr (mod, ';');
8114 if (ptr != NULL) {
8115 *ptr = '\0';
8116 }
8117 ValNodeCopyStr (&strings, 0, mod);
8118 MemFree (mod);
8119 }
8120
8121 if (StringDoesHaveText (dlp->m_isolate)) {
8122 /* x_EndsWithStrain just checks for supplied pattern, using here for isolate */
8123 if (! x_EndsWithStrain (dlp, dlp->m_isolate)) {
8124 ValNodeCopyStr (&strings, 0, " isolate ");
8125 ValNodeCopyStr (&strings, 0, dlp->m_isolate);
8126 }
8127 }
8128
8129 if (StringDoesHaveText (dlp->m_chromosome)) {
8130 ValNodeCopyStr (&strings, 0, " chromosome ");
8131 ValNodeCopyStr (&strings, 0, dlp->m_chromosome);
8132 }
8133
8134 cln = x_DescribeClones (dlp);
8135 if (StringDoesHaveText (cln)) {
8136 ValNodeCopyStr (&strings, 0, cln);
8137 }
8138 MemFree (cln);
8139
8140 if (StringDoesHaveText (dlp->m_map)) {
8141 ValNodeCopyStr (&strings, 0, " map ");
8142 ValNodeCopyStr (&strings, 0, dlp->m_map);
8143 }
8144
8145 if (StringDoesHaveText (dlp->m_plasmid)) {
8146 if (dlp->m_is_wgs) {
8147 ValNodeCopyStr (&strings, 0, " plasmid ");
8148 ValNodeCopyStr (&strings, 0, dlp->m_plasmid);
8149 }
8150 }
8151
8152 if (dlp->m_genome == GENOME_plasmid && dlp->m_topology == TOPOLOGY_CIRCULAR) {
8153 } else if (dlp->m_genome == GENOME_chromosome) {
8154 } else if (StringDoesHaveText (dlp->m_general_str) && StringICmp (dlp->m_general_str, dlp->m_chromosome) != 0) {
8155 ValNodeCopyStr (&strings, 0, " ");
8156 ValNodeCopyStr (&strings, 0, dlp->m_general_str);
8157 }
8158
8159 result = x_CatenateValNodeStrings (strings);
8160 ValNodeFreeData (strings);
8161 if (result == NULL) return NULL;
8162
8163 return result;
8164 }
8165
x_TitleFromMap(DefLinePtr dlp)8166 static CharPtr x_TitleFromMap (
8167 DefLinePtr dlp
8168 )
8169
8170 {
8171 BioseqPtr bsp;
8172 CharPtr result = NULL, mod, ptr;
8173 ValNodePtr strings = NULL;
8174
8175 if (dlp == NULL) return NULL;
8176
8177 bsp = dlp->m_bioseq;
8178 if (bsp == NULL) return NULL;
8179 if (bsp->seq_ext_type != 3) return NULL;
8180 if (bsp->seq_ext == NULL) return NULL;
8181
8182 ValNodeCopyStr (&strings, 0, dlp->m_taxname);
8183
8184 if (StringDoesHaveText (dlp->m_strain)) {
8185 mod = StringSave (dlp->m_strain);
8186 ptr = StringChr (mod, ';');
8187 if (ptr != NULL) {
8188 *ptr = '\0';
8189 }
8190 if (! x_EndsWithStrain (dlp, mod)) {
8191 ValNodeCopyStr (&strings, 0, " strain ");
8192 ValNodeCopyStr (&strings, 0, mod);
8193 }
8194 MemFree (mod);
8195 }
8196
8197 if (StringDoesHaveText (dlp->m_chromosome)) {
8198 ValNodeCopyStr (&strings, 0, " chromosome ");
8199 ValNodeCopyStr (&strings, 0, dlp->m_chromosome);
8200 } else if (dlp->m_is_chromosome) {
8201 ValNodeCopyStr (&strings, 0, " chromosome");
8202 }
8203
8204 if (StringDoesHaveText (dlp->m_plasmid)) {
8205 ValNodeCopyStr (&strings, 0, " plasmid ");
8206 ValNodeCopyStr (&strings, 0, dlp->m_plasmid);
8207 } else if (dlp->m_is_plasmid) {
8208 ValNodeCopyStr (&strings, 0, " plasmid");
8209 }
8210
8211 if (StringDoesHaveText (dlp->m_isolate)) {
8212 ValNodeCopyStr (&strings, 0, " isolate ");
8213 ValNodeCopyStr (&strings, 0, dlp->m_isolate);
8214 }
8215
8216 if (StringDoesHaveText (dlp->m_enzyme)) {
8217 ValNodeCopyStr (&strings, 0, ", ");
8218 ValNodeCopyStr (&strings, 0, dlp->m_enzyme);
8219 ValNodeCopyStr (&strings, 0, " whole genome map");
8220 }
8221
8222 result = x_CatenateValNodeStrings (strings);
8223 ValNodeFreeData (strings);
8224 if (result == NULL) return NULL;
8225
8226 return result;
8227 }
8228
/* Chooses the defline prefix from the record flags.  Only the first matching
 * category applies, in this order: unverified, TSA, TLS, third party (TPA
 * variants), multispecies WP protein, pseudogene.  The UNVERIFIED and
 * PUTATIVE PSEUDOGENE prefixes are suppressed when the title already
 * contains that text.
 * Returns StringSave of the chosen prefix (caller frees); NULL when dlp is
 * NULL.
 */
static CharPtr x_SetPrefix (
  DefLinePtr dlp,
  CharPtr title
)

{
  CharPtr  label = NULL;

  if (dlp == NULL) return NULL;

  if (dlp->m_is_unverified) {
    label = (StringStr (title, "UNVERIFIED") == NULL) ? "UNVERIFIED: " : NULL;
  } else if (dlp->m_is_tsa) {
    label = "TSA: ";
  } else if (dlp->m_is_tls) {
    label = "TLS: ";
  } else if (dlp->m_third_party) {
    /* plain TPA unless a more specific flavor is flagged */
    label = "TPA: ";
    if (dlp->m_tpa_exp) {
      label = "TPA_exp: ";
    } else if (dlp->m_tpa_inf) {
      label = "TPA_inf: ";
    } else if (dlp->m_tpa_reasm) {
      label = "TPA_asm: ";
    }
  } else if (dlp->m_multispecies && dlp->m_is_wp) {
    label = "MULTISPECIES: ";
  } else if (dlp->m_is_pseudogene) {
    label = (StringStr (title, "PUTATIVE PSEUDOGENE") == NULL) ? "PUTATIVE PSEUDOGENE: " : NULL;
  }

  return StringSave (label);
}
8267
CountDeltaGaps(BioseqPtr bsp)8268 static Int4 CountDeltaGaps (
8269 BioseqPtr bsp
8270 )
8271
8272 {
8273 DeltaSeqPtr dsp;
8274 Int4 num_gaps = 0;
8275 SeqLitPtr slitp;
8276 SeqLocPtr slocp;
8277
8278 if (bsp == NULL) return 0;
8279
8280 if (bsp->repr == Seq_repr_delta) {
8281 for (dsp = (DeltaSeqPtr) bsp->seq_ext; dsp != NULL; dsp = dsp->next) {
8282 switch (dsp->choice) {
8283 case 1:
8284 slocp = (SeqLocPtr)(dsp->data.ptrvalue);
8285 if (slocp == NULL) break;
8286 if (slocp->choice == SEQLOC_NULL) {
8287 num_gaps++;
8288 }
8289 break;
8290 case 2:
8291 slitp = (SeqLitPtr)(dsp->data.ptrvalue);
8292 if (slitp == NULL) break;
8293 if (slitp->seq_data == NULL || slitp->seq_data_type == Seq_code_gap) {
8294 num_gaps++;
8295 }
8296 break;
8297 default:
8298 break;
8299 }
8300 }
8301 }
8302
8303 return num_gaps;
8304 }
8305
/* Computes the text appended after the title.
 *
 * The suffix is "[ <targeted locus>]<technique phrase><completeness phrase>":
 *   - the technique phrase depends on dlp->m_mi_tech (HTGS phase, EST, STS,
 *     GSS, WGS, TSA, targeted locus) and is skipped when the title already
 *     contains the corresponding wording;
 *   - the completeness phrase is added only when appendComplete is set, the
 *     title mentions neither "complete" nor "partial", and the molecule is
 *     flagged complete;
 *   - for unordered delta sequences the technique phrase is replaced by
 *     ", N unordered pieces".
 * Returns a newly allocated string (possibly empty) that the caller frees,
 * or NULL when dlp is NULL or allocation fails.
 */
static CharPtr x_SetSuffix (
  DefLinePtr dlp,
  CharPtr title,
  Boolean appendComplete
)

{
  CharPtr  completeness = "";
  Int4     gap_count;
  CharPtr  locus_study = "";
  size_t   needed;
  CharPtr  ordering = "ordered";
  Char     scratch [512];
  Int4     seg_count;
  CharPtr  suffix;
  CharPtr  tech_phrase = "";

  if (dlp == NULL) return NULL;

  scratch [0] = '\0';

  switch (dlp->m_mi_tech) {
    case MI_TECH_htgs_0 :
      if (StringStr (title, "LOW-PASS") == NULL) {
        tech_phrase = ", LOW-PASS SEQUENCE SAMPLING";
      }
      break;
    case MI_TECH_htgs_1 :
      ordering = "unordered";
      /* fall through */
    case MI_TECH_htgs_2 :
      if (dlp->m_htgs_draft) {
        if (StringStr (title, "WORKING DRAFT") == NULL) {
          tech_phrase = ", WORKING DRAFT SEQUENCE";
        }
      } else if (! dlp->m_htgs_cancelled) {
        if (StringStr (title, "SEQUENCING IN") == NULL) {
          tech_phrase = ", *** SEQUENCING IN PROGRESS ***";
        }
      }
      /* delta sequences with gaps also report the number of pieces */
      if (dlp->m_is_delta &&
          CountGapsInDeltaSeq (dlp->m_bioseq, &seg_count, &gap_count, NULL, NULL, NULL, 0) &&
          gap_count > 0) {
        sprintf (scratch, "%s, %ld %s pieces", tech_phrase, (long) (gap_count + 1), ordering);
        tech_phrase = scratch;
      }
      break;
    case MI_TECH_htgs_3 :
      if (StringStr (title, "complete sequence") == NULL) {
        tech_phrase = ", complete sequence";
      }
      break;
    case MI_TECH_est :
      if (StringStr (title, "mRNA sequence") == NULL) {
        tech_phrase = ", mRNA sequence";
      }
      break;
    case MI_TECH_sts :
      if (StringStr (title, "sequence tagged site") == NULL) {
        tech_phrase = ", sequence tagged site";
      }
      break;
    case MI_TECH_survey :
      if (StringStr (title, "genomic survey sequence") == NULL) {
        tech_phrase = ", genomic survey sequence";
      }
      break;
    case MI_TECH_wgs :
      if (dlp->m_wgs_master) {
        if (StringStr (title, "whole genome shotgun sequencing") == NULL) {
          tech_phrase = ", whole genome shotgun sequencing project";
        }
      } else if (StringStr (title, "whole genome shotgun sequence") == NULL) {
        /* organelle name goes in front unless the title already has it */
        if (StringDoesHaveText (dlp->m_organelle) && StringStr (title, dlp->m_organelle) == NULL) {
          StringCat (scratch, " ");
          StringCat (scratch, dlp->m_organelle);
        }
        StringCat (scratch, ", whole genome shotgun sequence");
        tech_phrase = scratch;
      }
      break;
    case MI_TECH_tsa :
      if (dlp->m_tsa_master) {
        if (StringStr (title, "transcriptome shotgun assembly") == NULL) {
          tech_phrase = ", transcriptome shotgun assembly";
        }
      } else if (StringStr (title, "RNA sequence") == NULL) {
        switch (dlp->m_mi_biomol) {
          case MOLECULE_TYPE_MRNA :
            tech_phrase = ", mRNA sequence";
            break;
          case MOLECULE_TYPE_RRNA :
            tech_phrase = ", rRNA sequence";
            break;
          case MOLECULE_TYPE_NCRNA :
            tech_phrase = ", ncRNA sequence";
            break;
          case MOLECULE_TYPE_PRE_MRNA :
          case MOLECULE_TYPE_SNRNA :
          case MOLECULE_TYPE_SCRNA :
          case MOLECULE_TYPE_CRNA :
          case MOLECULE_TYPE_SNORNA :
          case MOLECULE_TYPE_TRANSCRIBED_RNA :
            tech_phrase = ", transcribed RNA sequence";
            break;
          default :
            break;
        }
      }
      break;
    case MI_TECH_targeted :
      if (dlp->m_tls_master) {
        if (StringStr (title, "targeted locus study") == NULL) {
          tech_phrase = ", targeted locus study";
        }
      } else {
        if (StringStr (title, "sequence") == NULL) {
          tech_phrase = ", sequence";
        }
      }
      if (StringDoesHaveText (dlp->m_targeted_locus) && StringStr (title, dlp->m_targeted_locus) == NULL) {
        locus_study = dlp->m_targeted_locus;
      }
      break;
    default :
      break;
  }

  if (appendComplete &&
      StringStr (title, "complete") == NULL &&
      StringStr (title, "partial") == NULL &&
      dlp->m_mi_completeness == 1) {
    if (dlp->m_is_plasmid) {
      completeness = ", complete sequence";
    } else if (dlp->m_genome == GENOME_mitochondrion ||
               dlp->m_genome == GENOME_chloroplast ||
               dlp->m_genome == GENOME_kinetoplast ||
               dlp->m_genome == GENOME_plastid ||
               dlp->m_genome == GENOME_apicoplast) {
      completeness = ", complete genome";
    } else if (dlp->m_is_chromosome) {
      /* a named chromosome is one sequence of the genome */
      if (StringDoesHaveText (dlp->m_chromosome)) {
        completeness = ", complete sequence";
      } else {
        completeness = ", complete genome";
      }
    }
  }

  /* this overwrites scratch, so it replaces any technique phrase built above */
  if (dlp->m_unordered && dlp->m_is_delta) {
    gap_count = CountDeltaGaps (dlp->m_bioseq);
    if (gap_count > 0) {
      sprintf (scratch, ", %ld unordered pieces", (long) (gap_count + 1));
      tech_phrase = scratch;
    }
  }

  needed = StringLen (tech_phrase) + StringLen (locus_study) + StringLen (completeness) + 5;
  suffix = (CharPtr) MemNew (needed * sizeof (Char));
  if (suffix == NULL) return NULL;

  suffix [0] = '\0';
  if (StringDoesHaveText (locus_study)) {
    StringCat (suffix, " ");
    StringCat (suffix, locus_study);
  }
  StringCat (suffix, tech_phrase);
  StringCat (suffix, completeness);

  return suffix;
}
8474
8475 static CharPtr tpa_prefix_list [] = {
8476 "TPA:",
8477 "TPA_exp:",
8478 "TPA_inf:",
8479 "TPA_reasm:",
8480 "TPA_asm:",
8481 "TSA:",
8482 "UNVERIFIED:",
8483 NULL
8484 };
8485
NewCreateDefLineExEx(ItemInfoPtr iip,BioseqPtr bsp,Boolean ignoreTitle,Boolean extProtTitle,Boolean gpipeMode,Boolean devMode)8486 NLM_EXTERN CharPtr NewCreateDefLineExEx (
8487 ItemInfoPtr iip,
8488 BioseqPtr bsp,
8489 Boolean ignoreTitle,
8490 Boolean extProtTitle,
8491 Boolean gpipeMode,
8492 Boolean devMode
8493 )
8494
8495 {
8496 Boolean appendComplete = FALSE;
8497 Boolean capitalize = TRUE;
8498 Char ch;
8499 DefLinePtr dlp;
8500 Uint2 entityID;
8501 int i;
8502 size_t len;
8503 ObjValNodePtr ovp;
8504 CharPtr result = NULL, prefix = NULL, suffix = NULL, title = NULL, fix = NULL;
8505 SeqDescrPtr sdp = NULL;
8506 CharPtr str = NULL;
8507
8508 if (bsp == NULL) return NULL;
8509
8510 /* now using GetNextDescriptorUnindexed, so need to have called AssignIDsInEntityEx */
8511 if (bsp->idx.entityID == 0) {
8512 entityID = ObjMgrGetEntityIDForPointer (bsp);
8513 if (entityID != 0) {
8514 AssignIDsInEntityEx (entityID, 0, NULL, NULL);
8515 }
8516 }
8517
8518 dlp = (DefLinePtr) MemNew (sizeof (DefLineData));
8519 if (dlp == NULL) return NULL;
8520
8521 dlp->m_low_quality_fsa = TextFsaNew ();
8522 TextFsaAdd (dlp->m_low_quality_fsa, "heterogeneous population sequenced");
8523 TextFsaAdd (dlp->m_low_quality_fsa, "low-quality sequence region");
8524 TextFsaAdd (dlp->m_low_quality_fsa, "unextendable partial coding region");
8525
8526 /* set flags from record components */
8527 dlp->m_iip = iip;
8528 dlp->m_bioseq = bsp;
8529
8530 dlp->m_reconstruct = ignoreTitle;
8531 dlp->m_allprotnames = extProtTitle;
8532
8533 dlp->m_gpipemode = gpipeMode;
8534 dlp->m_devmode = devMode;
8535
8536 /* clear ItemInfo fields */
8537 if (iip != NULL) {
8538 iip->entityID = 0;
8539 iip->itemID = 0;
8540 iip->itemtype = 0;
8541 }
8542
8543 /* set flags from record components */
8544 x_SetFlags (dlp);
8545
8546 if (! dlp->m_reconstruct) {
8547 /* look for existing instantiated title */
8548 if (dlp->m_is_aa && (! dlp->m_is_pdb)) {
8549 sdp = BioseqGetSeqDescr (bsp, Seq_descr_title, NULL);
8550 if (sdp != NULL && sdp->choice == Seq_descr_title) {
8551 str = (CharPtr) sdp->data.ptrvalue;
8552 }
8553 } else {
8554 sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_title, NULL);
8555 if (sdp != NULL && sdp->choice == Seq_descr_title) {
8556 str = (CharPtr) sdp->data.ptrvalue;
8557 }
8558 }
8559 if (StringDoesHaveText (str)) {
8560 title = StringSave (str);
8561 /* strip trailing periods, commas, semicolons, etc. */
8562 x_TrimPunctuationFromEnd (title);
8563 capitalize = FALSE;
8564
8565 /* set ItemInfo fields for selection */
8566 if (iip != NULL && sdp != NULL && sdp->extended != 0) {
8567 ovp = (ObjValNodePtr) sdp;
8568 iip->entityID = ovp->idx.entityID;
8569 iip->itemtype = ovp->idx.itemtype;
8570 iip->itemID = ovp->idx.itemID;
8571 }
8572 }
8573 }
8574
8575 /* use appropriate algorithm if title needs to be generated */
8576 if (StringHasNoText (title)) {
8577 /* PDB and patent records do not normally need source data */
8578 if (dlp->m_is_pdb) {
8579 title = x_TitleFromPDB (dlp);
8580 } else if (dlp->m_is_patent) {
8581 title = x_TitleFromPatent (dlp);
8582 }
8583
8584 if (StringHasNoText (title)) {
8585 /* set fields from source information */
8586 x_SetBioSrc (dlp);
8587
8588 /* several record types have specific methods */
8589 if (dlp->m_is_nc) {
8590 title = x_TitleFromNC (dlp);
8591 } else if (dlp->m_is_nm) {
8592 title = x_TitleFromNM (dlp);
8593 } else if (dlp->m_is_nr) {
8594 title = x_TitleFromNR (dlp);
8595 } else if (dlp->m_is_aa) {
8596 title = x_TitleFromProtein (dlp);
8597 } else if (dlp->m_is_seg && (! dlp->m_is_est_sts_gss)) {
8598 title = x_TitleFromSegSeq (dlp);
8599 } else if (dlp->m_is_tsa || (dlp->m_is_wgs && (! dlp->m_wgs_master)) || (dlp->m_is_tls && (! dlp->m_tls_master))) {
8600 title = x_TitleFromWGS (dlp);
8601 } else if (dlp->m_is_map) {
8602 title = x_TitleFromMap (dlp);
8603 }
8604 }
8605
8606 if (StringHasNoText (title) && dlp->m_gpipemode) {
8607 /* title using gpipe policy */
8608 title = x_TitleFromGPipe (dlp);
8609 }
8610
8611 if (StringHasNoText (title)) {
8612 /* default title using source fields */
8613 title = x_TitleFromBioSrc (dlp);
8614 if (dlp->m_mi_completeness == 1 && StringDoesHaveText (title)) {
8615 appendComplete = TRUE;
8616 }
8617 }
8618
8619 if (StringHasNoText (title)) {
8620 /* last resort title created here */
8621 /*
8622 title = StringSave ("No definition line found");
8623 */
8624 }
8625 }
8626
8627 /* remove TPA or TSA prefix, will rely on other data in record to set */
8628 for (i = 0; tpa_prefix_list [i] != NULL; i++) {
8629 len = StringLen (tpa_prefix_list [i]);
8630 if (StringNICmp (title, tpa_prefix_list [i], len) == 0) {
8631 x_TrimFirstNCharacters (title, len);
8632 }
8633 }
8634
8635 /* strip leading spaces remaining after removal of old TPA or TSA prefixes */
8636 TrimSpacesAroundString (title);
8637
8638 /* strip trailing commas, semicolons, and spaces (period may be an sp. species) */
8639 x_TrimMostPunctFromEnd (title);
8640
8641 /* calcualte prefix */
8642 prefix = x_SetPrefix (dlp, title);
8643
8644 /* calculate suffix */
8645 suffix = x_SetSuffix (dlp, title, appendComplete);
8646
8647 len = StringLen (prefix) + StringLen (title) + StringLen (suffix) + 4;
8648 result = (CharPtr) MemNew (sizeof (Char) * len);
8649
8650 if (result != NULL) {
8651 StringCat (result, prefix);
8652 StringCat (result, title);
8653 StringCat (result, suffix);
8654
8655 if (dlp->m_is_aa) {
8656 fix = StringStr (result, ". [");
8657 if (fix == NULL) {
8658 fix = StringStr (result, ", [");
8659 }
8660 if (fix != NULL) {
8661 *fix = ' ';
8662 }
8663 }
8664
8665 fix = StringStr (result, " ,");
8666 if (fix != NULL) {
8667 fix [0] = ',';
8668 fix [1] = ' ';
8669 }
8670
8671 fix = StringStr (result, ",,");
8672 if (fix != NULL) {
8673 fix [1] = ' ';
8674 }
8675 }
8676
8677 MemFree (prefix);
8678 MemFree (title);
8679 MemFree (suffix);
8680
8681 TextFsaFree (dlp->m_low_quality_fsa);
8682
8683 Asn2gnbkCompressSpaces (result);
8684
8685 if (! dlp->m_is_pdb && ! dlp->m_is_patent && ! dlp->m_is_aa && ! dlp->m_is_seg) {
8686 if (result != NULL) {
8687 ch = result [0];
8688 if (IS_LOWER (ch) && capitalize) {
8689 result [0] = TO_UPPER (ch);
8690 }
8691 }
8692 }
8693
8694 dlp = MemFree (dlp);
8695
8696 return result;
8697 }
8698
NewCreateDefLineEx(ItemInfoPtr iip,BioseqPtr bsp,Boolean ignoreTitle,Boolean extProtTitle,Boolean gpipeMode)8699 NLM_EXTERN CharPtr NewCreateDefLineEx (
8700 ItemInfoPtr iip,
8701 BioseqPtr bsp,
8702 Boolean ignoreTitle,
8703 Boolean extProtTitle,
8704 Boolean gpipeMode
8705 )
8706
8707 {
8708 return NewCreateDefLineExEx (iip, bsp, ignoreTitle, extProtTitle, gpipeMode, FALSE);
8709 }
8710
NewCreateDefLine(ItemInfoPtr iip,BioseqPtr bsp,Boolean ignoreTitle,Boolean extProtTitle)8711 NLM_EXTERN CharPtr NewCreateDefLine (
8712 ItemInfoPtr iip,
8713 BioseqPtr bsp,
8714 Boolean ignoreTitle,
8715 Boolean extProtTitle
8716 )
8717
8718 {
8719 return NewCreateDefLineExEx (iip, bsp, ignoreTitle, extProtTitle, FALSE, FALSE);
8720 }
8721
NewCreateDefLineBuf(ItemInfoPtr iip,BioseqPtr bsp,CharPtr buf,Uint4 buflen,Boolean ignoreTitle,Boolean extProtTitle)8722 NLM_EXTERN Boolean NewCreateDefLineBuf (
8723 ItemInfoPtr iip,
8724 BioseqPtr bsp,
8725 CharPtr buf,
8726 Uint4 buflen,
8727 Boolean ignoreTitle,
8728 Boolean extProtTitle)
8729
8730 {
8731 CharPtr title = NULL;
8732
8733 if (bsp == NULL || buf == NULL|| buflen == 0) return FALSE;
8734
8735 title = NewCreateDefLineEx (iip, bsp, ignoreTitle, extProtTitle, FALSE);
8736 StringNCpy_0 (buf, title, buflen);
8737 MemFree (title);
8738
8739 return TRUE;
8740 }
8741
8742