1 /*  subutil.h
2 * >> Set tabs to 4 spaces for a nice printout
3 *
4 * ===========================================================================
5 *
6 *                            PUBLIC DOMAIN NOTICE
7 *               National Center for Biotechnology Information
8 *
9 *  This software/database is a "United States Government Work" under the
10 *  terms of the United States Copyright Act.  It was written as part of
11 *  the author's official duties as a United States Government employee and
12 *  thus cannot be copyrighted.  This software/database is freely available
13 *  to the public for use. The National Library of Medicine and the U.S.
14 *  Government have not placed any restriction on its use or reproduction.
15 *
16 *  Although all reasonable efforts have been taken to ensure the accuracy
17 *  and reliability of the software and data, the NLM and the U.S.
18 *  Government do not and cannot warrant the performance or results that
19 *  may be obtained by using this software or data. The NLM and the U.S.
20 *  Government disclaim all warranties, express or implied, including
21 *  warranties of performance, merchantability or fitness for any particular
22 *  purpose.
23 *
24 *  Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 *
28 * File Name:  subutil.h
29 *
30 * Author:  James Ostell
31 *
32 * Version Creation Date: 11/3/93
33 *
34 * $Revision: 6.93 $
35 *
36 * File Description: Utilities for creating ASN.1 submissions
37 *
38 * Modifications:
39 * --------------------------------------------------------------------------
40 * Date	   Name        Description of modification
41 * -------  ----------  -----------------------------------------------------
42 *
43 * ==========================================================================
44 */
45 
46 #ifndef _NCBI_SubUtil_
47 #define _NCBI_SubUtil_
48 
49 #ifndef _NCBI_Submit_
50 #include <objsub.h>
51 #endif
52 
53 #undef NLM_EXTERN                   /* drop any earlier definition before redefining it for this header */
54 #ifdef NLM_IMPORT
55 #define NLM_EXTERN NLM_IMPORT       /* NLM_IMPORT is defined: the prototypes below are prefixed with it */
56 #else
57 #define NLM_EXTERN extern           /* otherwise the prototypes below are plain extern declarations */
58 #endif
59 
60 #ifdef __cplusplus
61 extern "C" {
62 #endif
63 
64 
65 /*****************************************************************************
66 *
67 *   Create a GenBank direct submission
68 *   	This supports a basic set of datatypes for making a new direct
69 *   submission to GenBank in ASN.1. It is designed for folks wanting to
70 *   read their own data storage format, then make a valid direct submission
71 *   without going through an intermediate tool.
72 *
73 *   	You may have many "entries" in a single submission. A single entry
74 *   may contain:
75 *   	One protein sequence       (called a "Bioseq")
76 *   	One nucleic acid sequence  (called a "Bioseq")
77 *   	One nucleic acid sequence for which you only have a series of
78 *          sequence pieces (e.g. you sequenced around the exons of a
79 *          genomic sequence, but not the introns) (called a "segmented
80 *          set")
81 *   	One nucleic acid sequence and the protein sequences it codes for.
82 *          (nucleic acid may be a single Bioseq or a segmented set)
83 *          (this entry called a "nuc-prot set")
84 *
85 *   NCBI considers the protein sequences part
86 *   of the submission, and they are created as proteins in their own right
87 *   by the routines below. You can either supply the protein sequence from
88 *   your own software (best case), in which case we check that the coding region
89 *   you supply translates to it. If you do not supply a protein sequence,
90 *   then all we can do is check that it translates without stops.
91 *
92 *   	NCBI also considers "gene" to refer to a region of nucleic acid
93 *   within which are found elements (such as promoters, coding regions,etc)
94 *   leading to a phenomenon recognized as a gene (note this also accommodates
95 *   anonymous markers as well as expressed products). This is in contrast to
96 *   some other notions that a gene is simply a qualifier on other features of
97 *   the DNA. A separate function to produce a gene feature is supplied. The
98 *   intervals given for it should include the intervals for the other
99 *   features it contains.
100 *
101 *   	The process of building the direct submission is roughly:
102 *
103 *   Create the submission
104 *   	Add the submission citation
105 *   	Create an entry  (can be 1 or more sequences)
106 *   		Add the organism information
107 *   		Add any publication citations
108 *   		Add the sequences
109 *             Fill in the residues
110 *   		Add the features
111 *   		Validate the entry
112 *   Write the entry
113 *   Free the memory used
114 *
115 *   Each element may have subfunctions:
116 *
117 *   Create a citation
118 *   	Add author names
119 *   	Add author affiliation
120 *
121 *   Create a sequence
122 *   	Add modifiers
123 *
124 *   Create a feature
125 *   	Add information specific to type of feature
126 *   	Add intervals on the sequence
127 *
128 *****************************************************************************/
129 typedef Boolean (* SubErrorFunc) (CharPtr msg);   /* error-handler callback type: receives the message text and returns a Boolean (meaning of the result is not stated in this header) */
130 
131 typedef struct ncbisub {            /* handle passed to every function in this header while a submission is built */
132 	SeqSubmitPtr ssp;               /* the submission */
133 	SubErrorFunc err_func;          /* the error handler (a SubErrorFunc callback) */
134 	Int2 gap_count;                 /* for unique gap names in segs */
135 	CharPtr submittor_key;          /* used for turning local SeqId to General; NOTE(review): presumably set by DefineSubmittorKey() - confirm */
136 } NCBISub, PNTR NCBISubPtr;
137 
138 #define PubPtr ValNodePtr           /* macro alias: PubPtr expands textually to ValNodePtr (should really be typedeffed) */
139 
140 /*****************************************************************************
141 *
142 *   Prototypes for building a direct submission
143 *
144 *****************************************************************************/
145 
146 						 /* default error handler */
147 
148 NLM_EXTERN Boolean DefaultSubErrorFunc (CharPtr msg);   /* the default error handler; has the SubErrorFunc signature */
149 
150 /*****************************************************************************
151 *
152 *   Create/Free the NCBISub
153 *
154 *****************************************************************************/
155 
156 NLM_EXTERN NCBISubPtr NCBISubCreate (   /* creates the NCBISub for a new submission; current replacement for old-style NCBISubBuild() */
157 	CharPtr last_name,     /* name fields - NOTE(review): presumably the submitter / contact person; confirm */
158 	CharPtr first_name,
159 	CharPtr middle_name,
160 	CharPtr initials,  /* separated by periods, no initial for last name */
161 	CharPtr suffix,    /* Jr. Sr. III */
162 	CharPtr affil,        /* e.g. "Xyz University" */
163 	CharPtr div,          /* e.g. "Dept of Biology" */
164 	CharPtr street,       /* e.g. "123 Academic Road" */
165 	CharPtr city,         /* e.g. "Metropolis" */
166 	CharPtr sub,          /* e.g. "Massachusetts" */
167 	CharPtr country ,     /* e.g. "USA" */
168 	CharPtr postal_code,  /* e.g."02133" */
169 	CharPtr phone ,
170 	CharPtr fax ,
171 	CharPtr email,
172 	Boolean hold_until_publish ,   /* NOTE(review): presumably requests that release wait for publication - confirm */
173 	Int2 release_month ,           /* release date, passed as separate month / day / year values */
174 	Int2 release_day ,
175 	Int2 release_year );           /* NOTE(review): expected year format (2 or 4 digits) is not stated here - confirm */
176 
177 NLM_EXTERN Boolean DefineSubmittorKey(   /* supplies the submittor key for nsp; the NCBISub comment says the key is used for turning local SeqId to General */
178 	NCBISubPtr nsp,
179 	CharPtr submittor_key );  /* submitting large scale lab, for regular submissions */
180 
181 /**** WARNING: NCBISubBuild() is the old style submission         ***/
182 /**** It has been replaced by  NCBISubCreate()                    ***/
183 /**** NCBISubBuild will be discontinued                           ***/
184 
185 NLM_EXTERN NCBISubPtr NCBISubBuild (   /* OLD STYLE: replaced by NCBISubCreate() and due to be discontinued */
186 	CharPtr name,
187 	CharPtr PNTR address ,         /* pointer to CharPtr - NOTE(review): presumably an array of address lines; termination rule not stated here */
188 	CharPtr phone ,
189 	CharPtr fax ,
190 	CharPtr email,
191 	Boolean hold_until_publish ,
192 	Int2 release_month,            /* release date, passed as separate month / day / year values */
193 	Int2 release_day,
194 	Int2 release_year );
195 
196 
197 
198                /** every submission must have 1 submission citation **/
199                /** see below to add authors and affiliation **********/
200 
201 NLM_EXTERN Boolean CitSubForSubmission (   /* supplies the submission citation; every submission must have exactly one */
202 	NCBISubPtr submission,
203 	PubPtr cit_sub );              /* the citation (PubPtr is a macro for ValNodePtr) */
204 
205 NLM_EXTERN Boolean AddToolToSub (   /* adds a tool string to the submission - NOTE(review): presumably names the generating program; confirm */
206 	NCBISubPtr nsp,
207 	CharPtr tool );
208 
209 NLM_EXTERN Boolean AddCommentToSub (   /* adds a comment to the submission itself (AddCommentToEntry is the per-entry counterpart) */
210 	NCBISubPtr nsp,
211 	CharPtr comment );
212 
213 NLM_EXTERN Boolean AddTypeToSub (   /* sets a type code on the submission */
214 	NCBISubPtr nsp,
215 	Uint1 type );                  /* NOTE(review): allowed values are not listed in this header - confirm against the Seq-submit spec */
216 
217 NLM_EXTERN Boolean NCBISubWrite (   /* the "write" step of the build process: outputs the submission to the named file */
218 	NCBISubPtr ssp,                /* note: an NCBISubPtr despite sharing its name with the SeqSubmitPtr field of NCBISub */
219 	CharPtr filename );
220 
221 NLM_EXTERN NCBISubPtr NCBISubFree (   /* the "free the memory used" step; NOTE(review): presumably returns NULL - confirm */
222 	NCBISubPtr ssp );              /* note: an NCBISubPtr despite sharing its name with the SeqSubmitPtr field of NCBISub */
223 
224 /*****************************************************************************
225 *
226 *   You can (should) run the ncbi validator routines on your final submission.
227 *   It returns a count of all errors or questions found.
228 *
229 *   The errfile parameter is no longer supported. Errors are directed
230 *   based on the ErrLog functions in the toolkit. If you have done none
231 *   of this, then errors will appear on stderr.
232 *
233 *****************************************************************************/
234 
235 NLM_EXTERN Int2 NCBISubValidate (NCBISubPtr nsp, FILE * errfile);   /* runs the validator; returns the count of errors or questions found; errfile is no longer supported */
236 
237 /*****************************************************************************
238 *
239 *   Add Entries to the Submission
240 *   Add Sequences to the Entries
241 *
242 *****************************************************************************/
243 
244 /*****************************************************************************
245 *
246 *   About Sequence Identifiers:
247 *
248 *   Note that in all functions below where you create a Bioseq in your entry,
249 *   you can supply a number of different pieces of information to make a
250 *   sequence id.
251 *
252 *   local_name: This is a string for whatever name you call this sequence
253 *               locally. Could be a clone name or whatever. There are no
254 *               limits on this other than it should be unique in the
255 *               submission. It is REQUIRED.
256 *
257 *   SeqEntryPtr: Returned by the function, this is a pointer to the Bioseq.
258 *
259 *   In later functions, such as adding feature locations, you can refer to
260 *   the Bioseq you created either with the local_name or directly with the
261 *   SeqEntryPtr. Whatever is more convenient for you is fine.
262 *
263 *   The other ids only apply to updates. These allow you to update your
264 *   entry in GenBank simply by sending a new entry with the same accession
265 *   number you were issued on the last one. In this case you should also
266 *   be sure to add the create_date, which will be returned to you in the
267 *   ASN.1 of your direct submission after processing. This is not absolutely
268 *   required, but does let us check that it is the right entry (errors
269 *   could occur when you enter your old accession number).
270 *
271 *   genbank_locus:  OPTIONAL on update. The name appearing on the LOCUS line.
272 *   genbank_accession: REQUIRED on update.
273 *   gi_number: OPTIONAL on update for now. The unique ID assigned by NCBI
274 *      to a particular sequence (DNA or protein) in your entry.
275 *
276 *   If you update your entry, whether you change the sequence or not, the
277 *   accession number and locus will remain the same, so people can retrieve
278 *   your new data with the old id. However, the gi_number is explicitly keyed
279 *   to the sequence, and will change if there are any changes/additions to
280 *   the sequence. In addition, a history will be created indicating the old
281 *   gi_number and the date the new entry replaced it. Both old and new
282 *   entries will be available from NCBI for retrieval on gi_number. Only the
283 *   new entry will appear in the next GenBank or Entrez release.
284 *
285 *****************************************************************************/
286 
287 
288             /*** Entry contains only 1 raw Bioseq ***/
289 
290 NLM_EXTERN SeqEntryPtr AddSeqOnlyToSubmission (   /* adds an entry that contains only 1 raw Bioseq; the returned SeqEntryPtr points to that Bioseq */
291 	NCBISubPtr submission ,
292 	CharPtr local_name ,           /* REQUIRED: your own name for the sequence, unique within the submission */
293 	CharPtr genbank_locus ,        /* OPTIONAL on update: the name appearing on the LOCUS line */
294 	CharPtr genbank_accession ,    /* REQUIRED on update */
295 	Int4 gi_number ,               /* OPTIONAL on update for now: NCBI id of this particular sequence */
296 	Int2 molecule_class,           /* presumably one of MOLECULE_CLASS_* - confirm */
297 	Int2 molecule_type ,           /* presumably one of MOLECULE_TYPE_* - confirm */
298 	Int4 length ,                  /* length of the Bioseq; residues can later be filled in up to this length */
299 	Int2 topology ,                /* presumably one of TOPOLOGY_* - confirm */
300 	Int2 strandedness );           /* presumably one of STRANDEDNESS_* - confirm */
301 
302             /*** Entry contains a segmented set of Bioseqs ***/
303 
304 NLM_EXTERN SeqEntryPtr AddSegmentedSeqToSubmission (   /* adds an entry that contains a segmented set of Bioseqs */
305 	NCBISubPtr submission ,
306 	CharPtr local_name ,           /* REQUIRED; the other ids apply only to updates (see "About Sequence Identifiers") */
307 	CharPtr genbank_locus ,
308 	CharPtr genbank_accession ,
309 	Int4 gi_number ,
310 	Int2 molecule_class,           /* presumably one of MOLECULE_CLASS_* - confirm */
311 	Int2 molecule_type ,           /* presumably one of MOLECULE_TYPE_* - confirm */
312 	Int4 length ,                  /* NOTE(review): presumably the length of the whole segmented sequence - confirm */
313 	Int2 topology ,                /* presumably one of TOPOLOGY_* - confirm */
314 	Int2 strandedness );           /* presumably one of STRANDEDNESS_* - confirm */
315 
316 NLM_EXTERN SeqEntryPtr AddSeqToSegmentedEntry (   /* adds one sequence piece to an existing segmented entry */
317 	NCBISubPtr submission,
318 	SeqEntryPtr segmented_seq_entry,   /* NOTE(review): presumably the value returned by one of the AddSegmentedSeqTo...() calls - confirm */
319 	CharPtr local_name ,           /* REQUIRED; the other ids apply only to updates (see "About Sequence Identifiers") */
320 	CharPtr genbank_locus ,
321 	CharPtr genbank_accession ,
322 	Int4 gi_number ,
323 	Int2 molecule_class,           /* presumably one of MOLECULE_CLASS_* - confirm */
324 	Int2 molecule_type ,           /* presumably one of MOLECULE_TYPE_* - confirm */
325 	Int4 length ,                  /* NOTE(review): presumably the length of this piece - confirm */
326 	Int2 topology ,                /* presumably one of TOPOLOGY_* - confirm */
327 	Int2 strandedness );           /* presumably one of STRANDEDNESS_* - confirm */
328 
329 NLM_EXTERN Boolean AddGapToSegmentedEntry (   /* adds a gap to a segmented entry; NCBISub.gap_count exists to give gaps unique names */
330 	NCBISubPtr submission,
331 	SeqEntryPtr segmented_seq_entry,
332 	Int4 length_of_gap );    /** 0 if not known */
333 
334 NLM_EXTERN Boolean AddReferenceToSegmentedEntry (   /* adds to a segmented entry a reference to an interval of a sequence named by accession / gi */
335 	NCBISubPtr submission ,
336 	SeqEntryPtr segmented_seq_entry,
337 	CharPtr genbank_accession ,
338 	Int4 gi_number ,
339 	Int4 from ,                    /* NOTE(review): coordinate base (0 or 1) and inclusiveness are not stated here - confirm */
340 	Int4 to ,
341 	Boolean on_plus_strand );      /* strand flag for the referenced interval */
342 
343 			/*** Entry contains sets of similar sequences ***/
344 
345 NLM_EXTERN SeqEntryPtr AddPopSetToSubmission (   /* adds an entry that holds a set of similar sequences - presumably a population set (from the name; confirm) */
346 	NCBISubPtr submission );
347 
348 NLM_EXTERN SeqEntryPtr AddPhySetToSubmission (   /* adds an entry that holds a set of similar sequences - presumably a phylogenetic set (from the name; confirm) */
349 	NCBISubPtr submission );
350 
351 NLM_EXTERN SeqEntryPtr AddMutSetToSubmission (   /* adds an entry that holds a set of similar sequences - presumably a mutation set (from the name; confirm) */
352 	NCBISubPtr submission );
353 
354 NLM_EXTERN SeqEntryPtr AddGenBankSetToSubmission (   /* adds an entry that holds a set of sequences - presumably a GenBank set (from the name; confirm) */
355 	NCBISubPtr submission );
356 
357 			/*** Entry contains nucleotide and translated proteins ***/
358 
359 NLM_EXTERN SeqEntryPtr AddNucProtToSubmission (   /* adds a "nuc-prot set" entry: a nucleic acid plus the proteins it codes for; sequences are added with the Add...ToNucProtEntry calls */
360 	NCBISubPtr submission );
361 
362 NLM_EXTERN SeqEntryPtr AddSeqToNucProtEntry (   /** add unsegmented nuc or prot bioseq */
363 	NCBISubPtr submission,
364 	SeqEntryPtr nuc_prot_entry,    /* NOTE(review): presumably the value returned by AddNucProtToSubmission() - confirm */
365 	CharPtr local_name ,           /* REQUIRED; the other ids apply only to updates (see "About Sequence Identifiers") */
366 	CharPtr genbank_locus ,
367 	CharPtr genbank_accession ,
368 	Int4 gi_number ,
369 	Int2 molecule_class,           /* presumably one of MOLECULE_CLASS_* - confirm */
370 	Int2 molecule_type ,           /* presumably one of MOLECULE_TYPE_* - confirm */
371 	Int4 length ,
372 	Int2 topology ,                /* presumably one of TOPOLOGY_* - confirm */
373 	Int2 strandedness );           /* presumably one of STRANDEDNESS_* - confirm */
374 								  /** add segmented nuc or prot bioseq set */
375 
376 NLM_EXTERN SeqEntryPtr AddSegmentedSeqToNucProtEntry (   /* segmented counterpart of AddSeqToNucProtEntry: adds a segmented bioseq set to a nuc-prot entry */
377 	NCBISubPtr submission,
378 	SeqEntryPtr nuc_prot_entry ,   /* NOTE(review): presumably the value returned by AddNucProtToSubmission() - confirm */
379 	CharPtr local_name ,           /* REQUIRED; the other ids apply only to updates (see "About Sequence Identifiers") */
380 	CharPtr genbank_locus ,
381 	CharPtr genbank_accession ,
382 	Int4 gi_number ,
383 	Int2 molecule_class,           /* presumably one of MOLECULE_CLASS_* - confirm */
384 	Int2 molecule_type ,           /* presumably one of MOLECULE_TYPE_* - confirm */
385 	Int4 length ,
386 	Int2 topology ,                /* presumably one of TOPOLOGY_* - confirm */
387 	Int2 strandedness );           /* presumably one of STRANDEDNESS_* - confirm */
388 
389 NLM_EXTERN SeqEntryPtr AddDeltaSeqToNucProtEntry (   /* delta-sequence counterpart of AddSeqToNucProtEntry: adds a delta sequence to a nuc-prot entry */
390 	NCBISubPtr submission,
391 	SeqEntryPtr nuc_prot_entry ,   /* NOTE(review): presumably the value returned by AddNucProtToSubmission() - confirm */
392 	CharPtr local_name ,           /* REQUIRED; the other ids apply only to updates (see "About Sequence Identifiers") */
393 	CharPtr genbank_locus ,
394 	CharPtr genbank_accession ,
395 	Int4 gi_number ,
396 	Int2 molecule_class,           /* presumably one of MOLECULE_CLASS_* - confirm */
397 	Int2 molecule_type ,           /* presumably one of MOLECULE_TYPE_* - confirm */
398 	Int4 length ,
399 	Int2 topology ,                /* presumably one of TOPOLOGY_* - confirm */
400 	Int2 strandedness );           /* presumably one of STRANDEDNESS_* - confirm */
401 
402 				 /**** Entry contains one delta sequence ****/
403 
404 NLM_EXTERN SeqEntryPtr AddDeltaSeqOnlyToSubmission (   /* adds an entry that contains one delta sequence; gaps and literals are added with the ...ToDeltaSeq calls */
405 	NCBISubPtr submission,
406 	CharPtr local_name ,           /* REQUIRED; the other ids apply only to updates (see "About Sequence Identifiers") */
407 	CharPtr genbank_locus ,
408 	CharPtr genbank_accession ,
409 	Int4 gi_number ,
410 	Int2 molecule_class,           /* presumably one of MOLECULE_CLASS_* - confirm */
411 	Int2 molecule_type ,           /* presumably one of MOLECULE_TYPE_* - confirm */
412 	Int4 length ,
413 	Int2 topology ,                /* presumably one of TOPOLOGY_* - confirm */
414 	Int2 strandedness );           /* presumably one of STRANDEDNESS_* - confirm */
415 
416 NLM_EXTERN Boolean AddGapToDeltaSeq (   /* adds a gap to a delta sequence */
417 	NCBISubPtr submission,
418 	SeqEntryPtr delta_seq_entry,   /* NOTE(review): presumably the value returned by an AddDeltaSeq...() call - confirm */
419 	Int4 length_of_gap );    /** 0 if not known */
420 
421 NLM_EXTERN SeqLitPtr AddFakeGapToDeltaSeq (   /* same parameters as AddGapToDeltaSeq but hands back the SeqLitPtr; NOTE(review): what makes the gap "fake" is not explained in this header */
422 	NCBISubPtr submission,
423 	SeqEntryPtr delta_seq_entry,
424 	Int4 length_of_gap );    /** returns slp so program can set lim - unk fuzz after empty gaps are spread */
425 
426 NLM_EXTERN SeqLitPtr AddLiteralToDeltaSeq (   /* adds a sequence literal to a delta sequence; the returned SeqLitPtr is the type AddBasesToLiteral / AddAminoAcidsToLiteral accept */
427 	NCBISubPtr submission,
428 	SeqEntryPtr delta_seq_entry,
429 	Int4 length_of_sequence );     /* NOTE(review): presumably the length of this literal only - confirm */
430 
431 
432 #define MOLECULE_CLASS_DNA 1       /* MOLECULE_CLASS_*: presumably the values for the molecule_class argument of the Add...Seq... calls - confirm */
433 #define MOLECULE_CLASS_RNA 2
434 #define MOLECULE_CLASS_NUC 4       /* listed out of numeric order: NUC is 4, PROTEIN is 3 */
435 #define MOLECULE_CLASS_PROTEIN 3
436 
437 #define MOLECULE_TYPE_GENOMIC 1    /* MOLECULE_TYPE_*: values 1-10 match the biomol codes listed in the Biomol comment in this file */
438 #define MOLECULE_TYPE_PRE_MRNA 2
439 #define MOLECULE_TYPE_MRNA 3
440 #define MOLECULE_TYPE_RRNA 4
441 #define MOLECULE_TYPE_TRNA 5
442 #define MOLECULE_TYPE_SNRNA 6
443 #define MOLECULE_TYPE_SCRNA 7
444 #define MOLECULE_TYPE_PEPTIDE 8
445 #define MOLECULE_TYPE_OTHER_GENETIC_MATERIAL 9
446 #define MOLECULE_TYPE_GENOMIC_MRNA_MIX 10
447 #define MOLECULE_TYPE_CRNA 11      /* 11-15 are not in the Biomol comment's list in this file */
448 #define MOLECULE_TYPE_SNORNA 12
449 #define MOLECULE_TYPE_TRANSCRIBED_RNA 13
450 #define MOLECULE_TYPE_NCRNA 14
451 #define MOLECULE_TYPE_TMRNA 15
452 
453 #define TOPOLOGY_LINEAR 1          /* TOPOLOGY_*: presumably the values for the topology argument of the Add...Seq... calls - confirm */
454 #define TOPOLOGY_CIRCULAR 2
455 #define TOPOLOGY_TANDEM 3
456 
457 #define STRANDEDNESS_SINGLE 1      /* STRANDEDNESS_*: presumably the values for the strandedness argument of the Add...Seq... calls - confirm */
458 #define STRANDEDNESS_DOUBLE 2
459 
460 /******************************************************************
461 *
462 *   Fill in Bases or Amino Acids
463 *   	 1) You may call functions as often per bioseq as you like
464 *   		  up to the length of the Bioseq
465 *   	 2) All codes are iupac and defined in /ncbi/data/seqcode.prt
466 *   		  as an ASN.1 file used by this code. Excerpts at the
467 *   		  end of this file. Even though it's ASN.1 you will find
468 *   		  you can read it with no trouble.
469 *   	 3) IUPAC codes are UPPER CASE. These functions will upper
470 *   		  case for you.
471 *   	 4) In nucleic acids 'U' will be changed to 'T'
472 *   	 5) In both cases, non-letters will be stripped from the
473 *   		  the input strings to facilitate input from external
474 *   		  formatted files with numbers and internal spaces and
475 *          such.
476 *
477 ******************************************************************/
478 
479 NLM_EXTERN Boolean AddBasesToBioseq (   /* adds nucleotide residues to a Bioseq; may be called repeatedly, up to the length of the Bioseq */
480 	NCBISubPtr submission ,
481 	SeqEntryPtr the_seq ,
482 	CharPtr the_bases );           /* IUPAC letters; upper-cased for you, 'U' changed to 'T', non-letters stripped */
483 
484 NLM_EXTERN Boolean AddAminoAcidsToBioseq (   /* adds amino acid residues to a Bioseq; may be called repeatedly, up to the length of the Bioseq */
485 	NCBISubPtr submission ,
486 	SeqEntryPtr the_seq ,
487 	CharPtr the_aas );             /* IUPAC letters; upper-cased for you, non-letters stripped */
488 
489            /** variant functions for Delta sequences ***/
490 
491 NLM_EXTERN Boolean AddBasesToLiteral (   /* delta-sequence variant of AddBasesToBioseq: the target is a SeqLitPtr instead of a SeqEntryPtr */
492 	NCBISubPtr submission ,
493 	SeqLitPtr the_literal ,        /* a SeqLitPtr, the type returned by AddLiteralToDeltaSeq() */
494 	CharPtr the_bases );
495 
496 NLM_EXTERN Boolean AddAminoAcidsToLiteral (   /* delta-sequence variant of AddAminoAcidsToBioseq: the target is a SeqLitPtr instead of a SeqEntryPtr */
497 	NCBISubPtr submission ,
498 	SeqLitPtr the_literal ,        /* a SeqLitPtr, the type returned by AddLiteralToDeltaSeq() */
499 	CharPtr the_aas );
500 
501 
502 /*****************************************************************************
503 *
504 *   Add Annotations to Entries
505 *
506 *****************************************************************************/
507 
508 NLM_EXTERN Boolean AddTitleToEntry (   /* adds a title annotation to an entry */
509 	NCBISubPtr submission,
510 	SeqEntryPtr entry ,
511 	CharPtr title );               /* the character conversion described in the long-comment rules below also applies to titles */
512 
513 NLM_EXTERN Boolean AddSecondaryAccnToEntry (   /* adds a secondary accession to an entry */
514 NCBISubPtr submission,
515 	SeqEntryPtr entry ,
516 	CharPtr accn );                /* NOTE(review): presumably the accession number as text - confirm */
517 
518 /*****************************************************************
519 *
520 *   rules for long comments
521 *     1) include no non-ascii characters (e.g. \t \r \n)
522 *     2) you may force a line feed on display by using tilde '~'
523 *     3) you format a table by adding leading spaces after a '~'
524 *     4) non-ascii chars will be converted on input (also for
525 *         title) \n='~', all others='#'
526 *
527 *****************************************************************/
528 
529 NLM_EXTERN Boolean AddCommentToEntry (   /* adds a comment to an entry; the comment text follows the "rules for long comments" above */
530 	NCBISubPtr submission,
531 	SeqEntryPtr entry ,
532 	CharPtr comment );             /* '~' forces a line feed on display; on input \n becomes '~' and other disallowed chars become '#' */
533 
534 NLM_EXTERN Boolean AddOrganismToEntryNew (   /* adds organism information to an entry; genetic codes default to universal (0) - call before SetGeneticCodeForEntry() */
535 	NCBISubPtr submission,
536 	SeqEntryPtr entry ,
537 	CharPtr scientific_name ,
538 	CharPtr common_name ,
539 	CharPtr virus_name ,
540 	CharPtr strain ,
541 	CharPtr synonym1,              /* up to three synonyms; NOTE(review): presumably NULL when unused - confirm */
542 	CharPtr synonym2,
543 	CharPtr synonym3,
544 	CharPtr taxonomy );            /* the parameter the obsolete AddOrganismToEntry() lacks */
545 
546            /** AddOrganismToEntryNew() defaults to universal code (0)
547            ** for both cytoplasmic and mitochondrial ribosomes. You
548            ** also supply the code when you create a CdRegion. If the
549            ** CdRegion code does not match the organism code, the
550            ** validator will warn, but will translate by CdRegion code.
551            ** if you need an alternate code, SetGeneticCodeForEntry()
552            ** can be used to eliminate the conflict. See table of genetic
553            ** codes at end of this file. You should call
554            ** AddOrganismToEntryNew() before calling
555            ** SetGeneticCodeForEntry() **/
556 
557 NLM_EXTERN Boolean AddOrganismToEntryEx (   /* same parameters as AddOrganismToEntryNew plus a trailing taxid */
558 	NCBISubPtr submission,
559 	SeqEntryPtr entry ,
560 	CharPtr scientific_name ,
561 	CharPtr common_name ,
562 	CharPtr virus_name ,
563 	CharPtr strain ,
564 	CharPtr synonym1,
565 	CharPtr synonym2,
566 	CharPtr synonym3,
567 	CharPtr taxonomy,
568 	Int4 taxid );                  /* the taxonID; NOTE(review): value meaning "none" is not stated here - confirm */
569 
570            /** AddOrganismToEntryEx() allows taxonID to be entered **/
571 
572 NLM_EXTERN Boolean SetGeneticCodeForEntry (   /* sets alternate genetic codes for an entry; call after AddOrganismToEntryNew(), which defaults both to universal (0) */
573 	NCBISubPtr submission,
574         SeqEntryPtr entry,
575         Uint1 genetic_code,  /* for cytoplasm */
576         Uint1 mito_code );   /* for mitochondria */
577 
578 
579 
580 /**************************************************
581 *  OBSOLETE!!! do not use. Use AddOrganismToEntryNew
582 *
583 **************************************************/
584 NLM_EXTERN Boolean AddOrganismToEntry (   /* OBSOLETE - use AddOrganismToEntryNew(), which takes the same parameters plus taxonomy */
585 	NCBISubPtr submission,
586 	SeqEntryPtr entry ,
587 	CharPtr scientific_name ,
588 	CharPtr common_name ,
589 	CharPtr virus_name ,
590 	CharPtr strain ,
591 	CharPtr synonym1,
592 	CharPtr synonym2,
593 	CharPtr synonym3);
594 
595 NLM_EXTERN Boolean AddGenBankBlockToEntry (   /* adds GenBank-block information (taxonomy, division, keywords) to an entry */
596 	NCBISubPtr submission,
597 	SeqEntryPtr entry ,
598 	CharPtr taxonomy ,
599 	CharPtr division ,             /* NOTE(review): presumably a GenBank division code - confirm */
600 	CharPtr keyword1 ,             /* up to three keywords; NOTE(review): presumably NULL when unused - confirm */
601 	CharPtr keyword2 ,
602 	CharPtr keyword3 );
603 
604 #define GENOME_unknown 0           /* GENOME_*: genome codes; 0-14 mirror the ASN.1 "genome" excerpt in the comment that follows */
605 #define GENOME_genomic 1
606 #define GENOME_chloroplast 2
607 #define GENOME_chromoplast 3
608 #define GENOME_kinetoplast 4
609 #define GENOME_mitochondrion 5
610 #define GENOME_plastid 6
611 #define GENOME_macronuclear 7
612 #define GENOME_extrachrom 8
613 #define GENOME_plasmid 9
614 #define GENOME_transposon 10
615 #define GENOME_insertion_seq 11
616 #define GENOME_cyanelle 12
617 #define GENOME_proviral 13
618 #define GENOME_virion 14
619 #define GENOME_nucleomorph 15      /* 15 and up are the "more types added" beyond that ASN.1 excerpt */
620 #define GENOME_apicoplast 16
621 #define GENOME_leucoplast 17
622 #define GENOME_proplastid 18
623 #define GENOME_endogenous_virus 19
624 #define GENOME_hydrogenosome 20
625 #define GENOME_chromosome 21
626 #define GENOME_chromatophore 22
627 #define GENOME_plasmid_in_mitochondrion 23
628 #define GENOME_plasmid_in_plastid 24
629 
630 /********************************************
631 *  Genome describes the type of genome in which the DNA, or the gene for
632 *   a protein, is located. Values are:
633     genome INTEGER {             -- biological context
634         unknown (0) ,
635         genomic (1) ,
636         chloroplast (2) ,
637         chromoplast (3) ,
638         kinetoplast (4) ,
639         mitochondrion (5) ,
640         plastid (6) ,
641         macronuclear (7) ,
642         extrachrom (8) ,
643         plasmid (9) ,
644         transposon (10) ,
645         insertion-seq (11) ,
646         cyanelle (12) ,
647         proviral (13) ,
648         virion (14) } DEFAULT unknown ,
649          more types added, see GENOME_.. above
650 **********************************************/
651 
652 NLM_EXTERN Boolean AddGenomeToEntry (   /* records the genome type for an entry */
653 	NCBISubPtr submission,
654 	SeqEntryPtr entry ,
655 	Int2	 type );               /* a genome code; see the GENOME_* defines */
656 
657 #define SUBSRC_chromosome 1        /* SUBSRC_*: SubSource type codes; same numbers as the "allowed values" list in the comment that follows */
658 #define SUBSRC_map 2
659 #define SUBSRC_clone 3
660 #define SUBSRC_subclone 4
661 #define SUBSRC_haplotype 5
662 #define SUBSRC_genotype 6
663 #define SUBSRC_sex 7
664 #define SUBSRC_cell_line 8
665 #define SUBSRC_cell_type 9
666 #define SUBSRC_tissue_type 10
667 #define SUBSRC_clone_lib 11
668 #define SUBSRC_dev_stage 12
669 #define SUBSRC_frequency 13
670 #define SUBSRC_germline 14
671 #define SUBSRC_rearranged 15
672 #define SUBSRC_lab_host 16
673 #define SUBSRC_pop_variant 17
674 #define SUBSRC_tissue_lib 18
675 #define SUBSRC_plasmid_name 19
676 #define SUBSRC_transposon_name 20
677 #define SUBSRC_insertion_seq_name 21
678 #define SUBSRC_plastid_name 22
679 #define SUBSRC_country 23
680 #define SUBSRC_segment 24
681 #define SUBSRC_endogenous_virus_name 25
682 #define SUBSRC_transgenic 26
683 #define SUBSRC_environmental_sample 27
684 #define SUBSRC_isolation_source 28
685 #define SUBSRC_lat_lon 29
686 #define SUBSRC_collection_date 30
687 #define SUBSRC_collected_by 31
688 #define SUBSRC_identified_by 32
689 #define SUBSRC_fwd_primer_seq 33
690 #define SUBSRC_rev_primer_seq 34
691 #define SUBSRC_fwd_primer_name 35
692 #define SUBSRC_rev_primer_name 36
693 #define SUBSRC_metagenomic 37
694 #define SUBSRC_mating_type 38
695 #define SUBSRC_linkage_group 39
696 #define SUBSRC_haplogroup 40
697 #define SUBSRC_whole_replicon 41
698 #define SUBSRC_phenotype 42
699 #define SUBSRC_altitude 43
700 #define SUBSRC_other 255
701 
702 /*********************************************
703 *  SubSource defines subclasses of source material
704 *    (also see OrgMod below for subclasses of organism names)
705 *
706 *  allowed values for type are:
707         chromosome (1) ,
708         map (2) ,
709         clone (3) ,
710         subclone (4) ,
711         haplotype (5) ,
712         genotype (6) ,
713         sex (7) ,
714         cell-line (8) ,
715         cell-type (9) ,
716         tissue-type (10) ,
717         clone-lib (11) ,
718         dev-stage (12) ,
719         frequency (13) ,
720         germline (14) ,
721         rearranged (15) ,
722         lab-host (16) ,
723         pop-variant (17) ,
724         tissue-lib (18) ,
725         plasmid-name (19) ,
726         transposon-name (20) ,
727         insertion-seq-name (21) ,
728         plastid-name (22) ,
729         country (23) ,
730         segment (24) ,
731         endogenous-virus-name (25) ,
732         transgenic (26) ,
733         environmental-sample (27) ,
734         isolation-source (28) ,
735         lat-lon (29) ,          -- +/- decimal degrees
736         collection-date (30) ,  -- DD-MMM-YYYY format
737         collected-by (31) ,     -- name of person who collected the sample
738         identified-by (32) ,    -- name of person who identified the sample
739         fwd-primer-seq (33) ,   -- sequence (possibly more than one; semicolon-separated)
740         rev-primer-seq (34) ,   -- sequence (possibly more than one; semicolon-separated)
741         fwd-primer-name (35) ,
742         rev-primer-name (36) ,
743         metagenomic (37) ,
744         mating-type (38) ,
745         linkage-group (39) ,
746         haplogroup (40) ,
747         whole-replicon (41) ,
748         phenotype (42) ,
749         altitude (43) ,
750         other (255) } ,
751 
752 *   value is an optional string to give the name (eg. of the
753 *     clone)
754 ******************************************/
755 NLM_EXTERN Boolean AddSubSourceToEntry (   /* adds a SubSource (subclass of source material) to an entry */
756 	NCBISubPtr submission,
757 	SeqEntryPtr entry ,
758 	Int2	 type ,                /* a SubSource type code; see the SUBSRC_* defines */
759 	CharPtr value);                /* optional string giving the name (eg. of the clone) */
760 
761 #define ORGMOD_strain 2            /* ORGMOD_*: OrgMod type codes (the list starts at 2); see the "allowed values" list in the OrgMod comment in this file */
762 #define ORGMOD_substrain 3
763 #define ORGMOD_type 4
764 #define ORGMOD_subtype 5
765 #define ORGMOD_variety 6
766 #define ORGMOD_serotype 7
767 #define ORGMOD_serogroup 8
768 #define ORGMOD_serovar 9
769 #define ORGMOD_cultivar 10
770 #define ORGMOD_pathovar 11
771 #define ORGMOD_chemovar 12
772 #define ORGMOD_biovar 13
773 #define ORGMOD_biotype 14
774 #define ORGMOD_group 15
775 #define ORGMOD_subgroup 16
776 #define ORGMOD_isolate 17
777 #define ORGMOD_common 18
778 #define ORGMOD_acronym 19
779 #define ORGMOD_dosage 20
780 #define ORGMOD_nat_host 21
781 #define ORGMOD_sub_species 22
782 #define ORGMOD_specimen_voucher 23
783 #define ORGMOD_authority 24
784 #define ORGMOD_forma 25
785 #define ORGMOD_forma_specialis 26
786 #define ORGMOD_ecotype 27
787 #define ORGMOD_synonym 28
788 #define ORGMOD_anamorph 29
789 #define ORGMOD_teleomorph 30
790 #define ORGMOD_breed 31
791 #define ORGMOD_gb_acronym 32
792 #define ORGMOD_gb_anamorph 33
793 #define ORGMOD_gb_synonym 34
794 #define ORGMOD_culture_collection 35
795 #define ORGMOD_bio_material 36
796 #define ORGMOD_metagenome_source 37
797 #define ORGMOD_type_material 38    /* not (yet) included in the OrgMod comment's value list in this file */
798 #define ORGMOD_old_lineage 253
799 #define ORGMOD_old_name 254
800 #define ORGMOD_other 255
801 
802 /* Defines for BioSrc.origin (the codes the IS_ORG_* macros compare against)
803  */
804 #define ORG_UNKNOWN 0
805 #define ORG_NATURAL 1
806 #define ORG_NATMUT 2
807 #define ORG_MUT 3
808 #define ORG_ARTIFICIAL 4
809 #define ORG_SYNTHETIC 5
810 #define ORG_OTHER 255
811 #define ORG_DEFAULT ORG_UNKNOWN    /* the default origin is ORG_UNKNOWN (0) */
812 
813 #define IS_ORG_UNKNOWN(S) ((S).origin == ORG_UNKNOWN)   /* each IS_ORG_* macro compares S.origin with one ORG_* code; S is used with '.', so pass the struct itself (dereference a pointer first) */
814 #define IS_ORG_NATURAL(S) ((S).origin == ORG_NATURAL)
815 #define IS_ORG_NATMUT(S) ((S).origin == ORG_NATMUT)
816 #define IS_ORG_MUT(S) ((S).origin == ORG_MUT)
817 #define IS_ORG_ARTIFICIAL(S) ((S).origin == ORG_ARTIFICIAL)
818 #define IS_ORG_SYNTHETIC(S) ((S).origin == ORG_SYNTHETIC)
819 #define IS_ORG_OTHER(S) ((S).origin == ORG_OTHER)
820 
821 
822 /*********************************************
823 *  OrgMod defines subclasses of organism names
824 *    (also see SubSource above for subclasses of source material)
825 *
826 *  allowed values for type are:
827         strain (2) ,
828         substrain (3) ,
829         type (4) ,
830         subtype (5) ,
831         variety (6) ,
832         serotype (7) ,
833         serogroup (8) ,
834         serovar (9) ,
835         cultivar (10) ,
836         pathovar (11) ,
837         chemovar (12) ,
838         biovar (13) ,
839         biotype (14) ,
840         group (15) ,
841         subgroup (16) ,
842         isolate (17) ,
843         common (18) ,
844         acronym (19) ,
845         dosage (20) ,          -- chromosome dosage of hybrid
846         nat-host (21) ,        -- natural host of this specimen
847         sub-species (22) ,
848         specimen-voucher (23) ,
849         authority (24) ,
850         forma (25) ,
851         forma-specialis (26) ,
852         ecotype (27) ,
853         synonym (28) ,
854         anamorph (29) ,
855         teleomorph (30) ,
856         breed (31) ,
857         gb-acronym (32) ,       -- used by taxonomy database
858         gb-anamorph (33) ,      -- used by taxonomy database
859         gb-synonym (34) ,       -- used by taxonomy database
860         culture-collection (35) ,
861         bio-material (36) ,
862         metagenome-source (37) ,
863         old-lineage (253) ,
864         old-name (254) ,
865         other (255) } ,         -- ASN5: old-name (254) will be added to next spec
866 
867 *   value is an optional string to give the name (eg. of the
868 *     variant)
869 ******************************************/
870 NLM_EXTERN Boolean AddOrgModToEntry (   /* adds an OrgMod (subclass of organism name) to an entry */
871 	NCBISubPtr submission,
872 	SeqEntryPtr entry ,
873 	Int2	 type ,                /* an OrgMod type code; see the ORGMOD_* defines */
874 	CharPtr value);                /* optional string giving the name (eg. of the variant) */
875 
876 /********************************************
877 *  Biomol describes the biological type of the molecule
878 *   current values are:
879     biomol INTEGER {
880         unknown (0) ,
881         genomic (1) ,
882         pre-RNA (2) ,              -- precursor RNA of any sort really
883         mRNA (3) ,
884         rRNA (4) ,
885         tRNA (5) ,
886         snRNA (6) ,
887         scRNA (7) ,
888         peptide (8) ,
889         other-genetic (9) ,      -- other genetic material
890         genomic-mRNA (10) , -- reported a mix of genomic and cdna sequence
891         other (255) } DEFAULT unknown ,
892 ********************************************/
893 NLM_EXTERN Boolean AddBiomolToEntry (   /* records the biological type of the molecule for an entry */
894 	NCBISubPtr submission,
895 	SeqEntryPtr entry ,
896 	Int2	 type );               /* a biomol code from the list above; MOLECULE_TYPE_* 1-10 carry the same numbers */
897 
898 /********************************************
899 *
900 *  What technique was used to get this sequence ?
901 *    There is a set of defines in objpubd.h for this:
902 *    Current list is:
903 #define MI_TECH_unknown 0
904 #define MI_TECH_standard 1
905 #define MI_TECH_est 2         EST division
906 #define MI_TECH_sts 3         STS division
907 #define MI_TECH_survey 4      GSS division
908 #define MI_TECH_genemap 5     Bioseq is a genetic map
909 #define MI_TECH_physmap 6     Bioseq is physical map
910 #define MI_TECH_derived 7     Bioseq is a computed inference
911 #define MI_TECH_concept_trans 8   conceptual translation
912 #define MI_TECH_seq_pept 9        peptide sequencing used
913 #define MI_TECH_both 10           combination of 8 and 9 used
914 #define MI_TECH_seq_pept_overlap 11  peptides ordered by overlap
915 #define MI_TECH_seq_pept_homol 12    peptides ordered by homology
916 #define MI_TECH_concept_trans_a 13   concept trans supplied by author
917 #define MI_TECH_other 255            doesn't fit anything
918 ***************************************
919 * The following are not explicitly in the ASN.1 spec yet
920 * but can still be legally used as numbers.
921 * These are for High Throughput Genome Sequences
922 * htgs_1  - preliminary data. sequence is made of multiple
923 *             contigs with gaps between them. The order of
924 *             the contigs is not known, although for
925 *             convenience they are in an arbitrary order
926 * htgs_2  - preliminary data. like htgs_1 except the
927 *             order of the contigs is known and the sequence
928 *             reflects the correct order
929 * htgs_3  - finished data. All annotations are machine
930 *             generated in bulk. Usually this has been placed
931 *             on a map
932 *
933 ******************************************
934 #define MI_TECH_htgs_1 14
935 #define MI_TECH_htgs_2 15
936 #define MI_TECH_htgs_3 16
937 **********************************************************/
938 NLM_EXTERN Boolean AddTechToEntry (
939 	NCBISubPtr submission,
940 	SeqEntryPtr entry ,
941 	Int2	 tech ); /* one of the MI_TECH_ values listed in the comment above */
942 
943 /********************************************
944 *  How complete is the molecule?
945 *   here are the allowed values:
946 *
947     completeness INTEGER {
948       unknown (0) ,
949       complete (1) ,                   -- complete biological entity
950       partial (2) ,                    -- partial but no details given
951       no-left (3),                     -- KNOWN missing 5' or NH3 end
952       no-right (4) ,                   -- KNOWN missing 3' or COOH end
953       no-ends (5) ,                    -- KNOWN missing both ends
954       has-left (6) ,                   -- KNOWN has complete 5' or NH3 end
955       has-right (7) ,                  -- KNOWN has complete 3' or COOH end
956       other (255) } DEFAULT unknown }
957 
958 *******************************************/
959 NLM_EXTERN Boolean AddCompleteToEntry (
960 	NCBISubPtr submission,
961 	SeqEntryPtr entry ,
962 	Int2	 complete ); /* completeness code from the list above, e.g. complete (1) or partial (2) */
963 
964 NLM_EXTERN void AddCompleteness(NCBISubPtr submission, SeqEntryPtr sep, SeqFeatPtr sfp); /* NOTE(review): undocumented in this header; its relationship to AddCompleteToEntry and the role of sfp need confirming against the implementation */
965 
966 /**** OBSOLETE!!!! ***********************************************
967 *     DO NOT USE GIBBmethod
968 *     this is subsumed into AddTechToEntry, above
969 *
970 ****************************************************************/
971 
972 NLM_EXTERN Boolean AddGIBBmethodToEntry ( /* OBSOLETE per the comment above: subsumed into AddTechToEntry */
973 	NCBISubPtr submission,
974 	SeqEntryPtr entry ,
975 	Int2	 method ); /* presumably one of the METHOD_ values defined below -- confirm */
976 
977 #define METHOD_concept_transl 1 /* METHOD_ codes: presumably the values for "method" in AddGIBBmethodToEntry -- confirm */
978 #define METHOD_seq_pept 2 /* names parallel MI_TECH_ 8 - 13 listed in the AddTechToEntry comment */
979 #define METHOD_both 3
980 #define METHOD_seq_pept_overlap 4
981 #define METHOD_seq_pept_homol 5
982 #define METHOD_concept_transl_a 6
983 #define METHOD_other 255
984 
985 NLM_EXTERN Boolean AddCreateDateToEntry ( /* NOTE(review): undocumented in this header */
986 	NCBISubPtr submission,
987 	SeqEntryPtr entry ,
988 	Int2 month ,
989 	Int2 day ,
990 	Int2 year ); /* NOTE(review): accepted ranges (e.g. 2- vs 4-digit year) are not stated here -- confirm */
991 
992 /*************************************************************************
993 *
994 *   Modifiers modify the meaning of all entries in the set or sequence
995 *   to which they are applied. This is particularly important for
996 *   indicating organelle sequences, RNA genomes, or mutants.
997 *
998 *   Less obvious is indicating completeness.
999 *
1000 *   A genomic sequence is assumed to be partial unless the "complete"
1001 *      modifier is used.
1002 *   A peptide sequence is assumed to be complete unless the "partial"
1003 *      modifier is used.
1004 *   A cDNA is assumed to be complete (as well as one can tell) unless
1005 *      "partial" is used.
1006 *
1007 *   A genomic sequence is assumed to be nuclear unless the "mitochondrial"
1008 *      (or other organelle) modifier is used.
1009 *   All sequences are assumed to be natural unless "synthetic",
1010 *      "recombinant", or "mutagen" are used.
1011 *
1012 *************************************************************************/
1013 
1014 /***************************************
1015 *  Adds a ValNode of the appropriate type
1016 *    to the SeqEntry
1017 *    Note that caller must still create the
1018 *     specific descriptor structure and attach it to
1019 *     the returned ValNode
1020 *
1021 ****************************************/
1022 
1023 NLM_EXTERN ValNodePtr NewDescrOnSeqEntry (SeqEntryPtr entry, Int2 type); /* returns the added ValNode; caller must still create the descriptor structure and attach it (see above) */
1024 
1025 NLM_EXTERN ValNodePtr GetDescrOnSeqEntry ( /* NOTE(review): presumably looks up an existing descriptor of this type rather than adding one -- confirm */
1026 	SeqEntryPtr entry,
1027 	Int2 type);
1028 
1029 NLM_EXTERN Boolean AddModifierToEntry (
1030 	NCBISubPtr submission,
1031 	SeqEntryPtr entry , /* set or sequence the modifier applies to (see Modifiers comment above) */
1032 	Int2 modifier ); /* presumably one of the MODIF_ values defined below -- confirm */
1033 
1034 #define MODIF_dna  0 /* MODIF_ codes: presumably the values for "modifier" in AddModifierToEntry -- confirm */
1035 #define MODIF_rna  1
1036 #define MODIF_extrachrom  2
1037 #define MODIF_plasmid  3
1038 #define MODIF_mitochondrial  4 /* genomic sequence is assumed nuclear unless an organelle modifier is used */
1039 #define MODIF_chloroplast  5
1040 #define MODIF_kinetoplast  6
1041 #define MODIF_cyanelle  7
1042 #define MODIF_synthetic  8		/* synthetic sequence */
1043 #define MODIF_recombinant  9	/* recombinant construct */
1044 #define MODIF_partial  10 /* peptides and cDNAs are assumed complete unless this is used */
1045 #define MODIF_complete  11 /* genomic sequence is assumed partial unless this is used */
1046 #define MODIF_mutagen  12 /* subject of mutagenesis ? */
1047 #define MODIF_natmut  13  /* natural mutant ? */
1048 #define MODIF_transposon  14
1049 #define MODIF_insertion_seq  15
1050 #define MODIF_no_left  16 /* missing left end (5' for na, NH2 for aa) */
1051 #define MODIF_no_right  17   /* missing right end (3' or COOH) */
1052 #define MODIF_macronuclear  18
1053 #define MODIF_proviral  19
1054 #define MODIF_est  20    /* expressed sequence tag */
1055 
1056 
1057 	                           /*** add/build publications ***/
1058 NLM_EXTERN Boolean AddPubToEntry (
1059 	NCBISubPtr submission,
1060 	SeqEntryPtr entry ,
1061 	PubPtr pub ); /* presumably a PubPtr returned by one of the Cit...Build functions below -- confirm */
1062 
1063 NLM_EXTERN PubPtr CitSubBuild (               /* for first data submission **/
1064 	NCBISubPtr submission,
1065 	Int2 month,
1066 	Int2 day,
1067 	Int2 year,
1068 	Int2 medium ); /* presumably one of the MEDIUM_ values defined below -- confirm */
1069 
1070 
1071 NLM_EXTERN PubPtr CitSubUpdateBuild (   /* for updates to existing record */
1072 	NCBISubPtr submission,
1073 	Int2 month,
1074 	Int2 day,
1075 	Int2 year ,
1076 	Int2 medium , /* presumably one of the MEDIUM_ values defined below -- confirm */
1077 	CharPtr descr );  /* description of update, make it short */
1078 
1079 #define MEDIUM_NOT_SET 0 /* MEDIUM_ codes: presumably the values for "medium" in CitSubBuild and CitSubUpdateBuild -- confirm */
1080 #define MEDIUM_PAPER 1
1081 #define MEDIUM_TAPE 2
1082 #define MEDIUM_FLOPPY 3
1083 #define MEDIUM_EMAIL 4
1084 #define MEDIUM_OTHER 255
1085 
1086 NLM_EXTERN PubPtr CitArtBuild ( /* NOTE(review): which string arguments may be NULL is not stated here -- confirm */
1087 	NCBISubPtr submission,
1088 	CharPtr title ,
1089 	CharPtr journal ,
1090 	CharPtr volume ,
1091 	CharPtr issue ,
1092 	CharPtr pages ,
1093 	Int2 month ,
1094 	Int2 day ,
1095 	Int2 year ,
1096 	Int2 status ); /* presumably one of the PUB_STATUS_ values defined below -- confirm */
1097 
1098 #define PUB_STATUS_PUBLISHED 0 /* PUB_STATUS_ codes: presumably the values for "status" in CitArtBuild -- confirm */
1099 #define PUB_STATUS_SUBMITTED 1
1100 #define PUB_STATUS_IN_PRESS  2
1101 #define PUB_STATUS_UNPUBLISHED 3
1102 
1103 /*************************************************************************
1104 *
1105 *   Author names can be given in various forms
1106 *   	You MUST give at least a last name
1107 *   	You should give at least first name or initials.
1108 *       Initials are just for first and middle names, and are
1109 *         separated by periods.
1110 *
1111 *   example: John Q. Public
1112 *   last_name = "Public"
1113 *   first_name = "John"
1114 *   middle_name = NULL
1115 *   initials = "J.Q."
1116 *
1117 *************************************************************************/
1118 
1119 
1120 NLM_EXTERN Boolean AddAuthorToPub (    /* call once for each author, in order */
1121 	NCBISubPtr submission,
1122 	PubPtr the_pub,
1123 	CharPtr last_name, /* required: you MUST give at least a last name (see comment above) */
1124 	CharPtr first_name, /* you should give at least this or initials */
1125 	CharPtr middle_name, /* may be NULL, as in the example above */
1126 	CharPtr initials,  /* separated by periods, no initial for last name */
1127 	CharPtr suffix );  /* Jr. Sr. III */
1128 
1129 
1130 /*************************************************************************
1131 *
1132 *   Author Affiliation
1133 *      only one allowed per pub (one per author is also possible, but is
1134 *      not supported by this interface )
1135 *
1136 *   affil = institutional affiliation
1137 *   div   = division of institution
1138 *   street = street address
1139 *   city = city
1140 *   sub = subdivision of country (e.g. state.. optional)
1141 *   country = country
1142 *   postal_code = zip code in the USA
1143 *
1144 *************************************************************************/
1145 
1146 
1147 NLM_EXTERN Boolean AddAffiliationToPub (  /* call once per pub */
1148 	NCBISubPtr submission,
1149 	PubPtr the_pub,       /* this interface supports only one affiliation per pub */
1150 	CharPtr affil,        /* institutional affiliation, e.g. "Xyz University" */
1151 	CharPtr div,          /* division of institution, e.g. "Dept of Biology" */
1152 	CharPtr street,       /* street address, e.g. "123 Academic Road" */
1153 	CharPtr city,         /* e.g. "Metropolis" */
1154 	CharPtr sub,          /* subdivision of country (optional), e.g. "Massachusetts" */
1155 	CharPtr country ,     /* e.g. "USA" */
1156 	CharPtr postal_code ); /* zip code in the USA, e.g. "02133" */
1157 
1158 
1159 /*****************************************************************************
1160 *
1161 *   Add Features to the entry
1162 *   	Add location to feature
1163 *   	Add info for specific types to feature
1164 *
1165 *****************************************************************************/
1166 NLM_EXTERN SeqFeatPtr FeatureBuild ( /* builds the general feature; add locations and type-specific info with the functions below */
1167 	NCBISubPtr submission,
1168 	SeqEntryPtr entry_to_put_feature,
1169 	Boolean feature_is_partial,
1170 	Uint1 evidence_is_experimental, /* presumably one of the EVIDENCE_ values defined below -- confirm */
1171 	Boolean biological_exception,
1172 	CharPtr comment ); /* must be supplied if MakeCommentFeature will be used on the result */
1173 
1174 #define EVIDENCE_NOT_SET 0 /* EVIDENCE_ codes: presumably the values for evidence_is_experimental in FeatureBuild -- confirm */
1175 #define EVIDENCE_EXPERIMENTAL 1
1176 #define EVIDENCE_NOT_EXPERIMENTAL 2
1177 
1178 /*************************************************************************
1179 *
1180 *   About feature locations:
1181 *     Internally the NCBI software represents locations on sequence as
1182 *	offsets from the start of the sequence (i.e. from 0 - (length -1)).
1183 *   Also, the "from" position is always <= "to", even for locations on
1184 *   the minus strand. Finally, no location can cross the origin of a
1185 *   circular sequence.. it must be split in two. This makes routines
1186 *   that access locations very consistent and easy to write.
1187 *
1188 *     However, most biologists number sequences starting with 1, not 0.
1189 *   It is natural to think of a coding region on the minus strand going
1190 *   from 5243 to 2993. And it is not unusual to think of the origin of
1191 *   replication being from 5344 to 10 on the plus strand of a circular
1192 *   sequence.
1193 *
1194 *     AddIntervalToFeature and AddPointToFeature were written to support
1195 *   the biological notion. They convert to the internal format
1196 *   automatically. So, for these two functions:
1197 *
1198 *	1) numbers are in the range 1 - length
1199 *   2) from <= to on plus strand
1200 *      to <= from on minus strand
1201 *   3) numbers not conforming to (2) are assumed to go around the origin
1202 *      of a circular sequence. It is an error on a linear sequence.
1203 *   4) Intervals should be added in biological order (e.g. exon1, exon2,
1204 *      exon3...) no matter which strand the feature is on.
1205 *   5) You must always indicate explicitly the Bioseq the interval is
1206 *      on. You may either pass in the SeqEntryPtr or the local_name you
1207 *      used when you created the sequence. The sequence must have
1208 *      been previously created with AddSeqTo...	 If you give both the
1209 *      SeqEntryPtr and the local_name, they must agree.
1210 *   6) -1 (minus one) is a short hand for "end of sequence". To indicate
1211 *      the whole sequence you can give from = 1, to = -1
1212 *
1213 *************************************************************************/
1214 
1215 NLM_EXTERN Boolean AddIntervalToFeature ( /* takes biological (1-based) coordinates; see rules 1 - 6 above */
1216 	NCBISubPtr submission,
1217 	SeqFeatPtr sfp,
1218 	SeqEntryPtr the_seq , /* Bioseq the interval is on; give this and/or local_name (rule 5) */
1219 	CharPtr local_name , /* name used when the sequence was created; must agree with the_seq if both are given */
1220 	Int4 from , /* in the range 1 - length (rule 1) */
1221 	Int4 to , /* in the range 1 - length; -1 is shorthand for end of sequence (rule 6) */
1222 	Boolean on_plus_strand , /* from <= to on plus strand, to <= from on minus strand (rule 2) */
1223 	Boolean start_before_from , /* NOTE(review): presumably flags a start lying before "from" -- confirm */
1224 	Boolean stop_after_to ); /* NOTE(review): presumably flags a stop lying after "to" -- confirm */
1225 
1226 NLM_EXTERN Boolean AddIntToSeqLoc ( /* NOTE(review): the 1-based rules above are stated only for AddIntervalToFeature and AddPointToFeature; confirm the convention used here */
1227   SeqLocPtr PNTR old_slp,
1228   Int4 from,
1229   Int4 to,
1230   SeqIdPtr sip,
1231   Int2 fuzz_from, /* NOTE(review): encoding of the fuzz values is not shown in this header */
1232   Int2 fuzz_to,
1233   Int2 strand); /* NOTE(review): numeric strand codes are not listed in this header */
1234 
1235 NLM_EXTERN Boolean AddIntToSeqFeat ( /* NOTE(review): the 1-based rules above are stated only for AddIntervalToFeature and AddPointToFeature; confirm the convention used here */
1236 	SeqFeatPtr sfp,
1237 	Int4 from,
1238 	Int4 to,
1239 	BioseqPtr bsp,
1240 	Int2 fuzz_from, /* NOTE(review): encoding of the fuzz values is not shown in this header */
1241 	Int2 fuzz_to,
1242 	Int2 strand); /* NOTE(review): numeric strand codes are not listed in this header */
1243 
1244 NLM_EXTERN Boolean AddPointToFeature ( /* takes biological (1-based) coordinates, like AddIntervalToFeature */
1245 	NCBISubPtr submission,
1246 	SeqFeatPtr sfp,
1247 	SeqEntryPtr the_seq , /* Bioseq the point is on; give this and/or local_name (rule 5 above) */
1248 	CharPtr local_name ,
1249 	Int4 location , /* in the range 1 - length (rule 1 above) */
1250 	Boolean on_plus_strand ,
1251 	Boolean is_after_location , /* NOTE(review): exact meaning not documented here -- confirm */
1252 	Boolean is_before_location );
1253 
1254 NLM_EXTERN Boolean AddPntToSeqLoc ( /* NOTE(review): undocumented here; confirm whether point is 0- or 1-based */
1255 	SeqLocPtr PNTR p_slp,
1256 	Int4 point,
1257 	BioseqPtr bsp,
1258 	Int2 fuzz,
1259 	Int2 strand);
1260 
1261 NLM_EXTERN Boolean AddPntToSeqFeat ( /* NOTE(review): undocumented here; confirm whether point is 0- or 1-based */
1262 	SeqFeatPtr sfp,
1263 	Int4 point,
1264 	BioseqPtr bsp,
1265 	Int2 fuzz,
1266 	Int2 strand);
1267 
1268 /*************************************************************************
1269 *
1270 *   Having made a generalized feature, now add type specific info to it.
1271 *
1272 *************************************************************************/
1273 
1274 /************************************************************
1275 *
1276 *  A comment is the simplest feature. It is required that you
1277 *  supply a "comment" argument to FeatureBuild. In GenBank format
1278 *  it will appear as misc_feature, with the comment appearing as the
1279 *  /note.
1280 ***************************************************************/
1281 
1282 NLM_EXTERN Boolean MakeCommentFeature (
1283 	NCBISubPtr submission,
1284 	SeqFeatPtr feature ); /* must come from FeatureBuild called with a "comment" argument (see above) */
1285 
1286 /*****************************************************************
1287 *
1288 *   This connects a protein sequence with the nucleic acid
1289 *   region which codes for it. So the protein is given as an
1290 *   argument, as well as adding intervals on the nucleic acid.
1291 *   A complete coding region includes the initial Met codon and
1292 *   the final termination codon.
1293 *
1294 *****************************************************************/
1295 
1296 
1297 NLM_EXTERN Boolean MakeCdRegionFeature (
1298 	NCBISubPtr submission,
1299 	SeqFeatPtr feature, /* feature whose intervals are the nucleic acid region coding for the protein */
1300 	Int2 frame , /* NOTE(review): allowed frame values are not listed in this header -- confirm */
1301 	Int2 genetic_code ,	   /* see end of this file for genetic codes */
1302 	SeqEntryPtr protein_product,	/* give id of protein. if NULL, call */
1303 	CharPtr local_id_for_protein); /* function below to create by transl */
1304 
1305 
1306 /******************************************************************
1307 *
1308 *   A Code-break allows an exception to be made in the translation
1309 *    of a particular codon. You must give positions of the first
1310 *    and last bases of the codon in the DNA sequence and the amino
1311 *    acid to place there, instead of the normal translation. This
1312 *    should be used sparingly, and a comment on the feature should
1313 *    explain why it was done.
1314 *
1315 *   The location is specified the same as in AddIntervalToFeature.
1316 *   AA_for_protein points to the amino acid to use, in ncbieaa code.
1317 *
1318 ******************************************************************/
1319 
1320 NLM_EXTERN Boolean AddCodeBreakToCdRegion ( /* use sparingly and explain in the feature comment (see above) */
1321 	NCBISubPtr submission,
1322 	SeqFeatPtr sfp,
1323 	SeqEntryPtr the_seq ,
1324 	CharPtr local_name ,
1325 	Int4 from , /* first base of the codon in the DNA, specified as in AddIntervalToFeature */
1326 	Int4 to , /* last base of the codon in the DNA */
1327 	Boolean on_plus_strand ,
1328 	CharPtr AA_for_protein ); /* amino acid to place there instead of the normal translation, in ncbieaa code */
1329 
1330 
1331 /******************************************************************
1332 *
1333 *   Special function to make protein from CdRegion feature
1334 *
1335 ******************************************************************/
1336 
1337 NLM_EXTERN SeqEntryPtr TranslateCdRegion ( /* makes a protein from a CdRegion feature (see comment above) */
1338 	NCBISubPtr submission ,
1339 	SeqFeatPtr cdregion_feature ,
1340 	SeqEntryPtr nuc_prot_entry_to_put_sequence ,
1341 	CharPtr local_name ,             /* for protein sequence */
1342 	CharPtr genbank_locus , /* NOTE(review): presumably further identifiers for the protein, like local_name -- confirm */
1343 	CharPtr genbank_accession ,
1344 	Int4 gi_number );
1345 
1346 NLM_EXTERN Boolean MakeRNAFeature (
1347 	NCBISubPtr submission,
1348 	SeqFeatPtr feature,
1349 	Int2 rna_type , /* presumably one of the RNA_TYPE_ values defined below -- confirm */
1350 	Boolean is_pseudo_gene,
1351 	CharPtr rna_name ,
1352 	CharPtr AA_for_tRNA , /* NOTE(review): presumably used only when rna_type is RNA_TYPE_tRNA -- confirm */
1353 	CharPtr codon_for_tRNA );
1354 
1355 #define RNA_TYPE_premsg 1 /* RNA_TYPE_ codes: presumably the values for rna_type in MakeRNAFeature -- confirm */
1356 #define RNA_TYPE_mRNA   2
1357 #define RNA_TYPE_tRNA   3
1358 #define RNA_TYPE_rRNA   4
1359 #define RNA_TYPE_snRNA  5
1360 #define RNA_TYPE_scRNA  6
1361 #define RNA_TYPE_snoRNA 7
1362 #define RNA_TYPE_ncRNA  8
1363 #define RNA_TYPE_tmRNA  9
1364 #define RNA_TYPE_misc_RNA 10
1365 #define RNA_TYPE_other  255
1366 
1367 /******************************************************************
1368 *
1369 *  Once you have made a tRNA feature, you may optionally add
1370 *   the location of the anticodon if you know it. This should be
1371 *   within the range of the tRNA feature already created, obviously.
1372 *
1373 *   the location is specified on the DNA the same as for
1374 *     AddIntervalToFeature
1375 *
1376 ******************************************************************/
1377 
1378 NLM_EXTERN Boolean AddAntiCodonTotRNA ( /* optional; call after the tRNA feature has been made (see above) */
1379 	NCBISubPtr submission,
1380 	SeqFeatPtr sfp,
1381 	SeqEntryPtr the_seq ,
1382 	CharPtr local_name ,
1383 	Int4 from , /* on the DNA, specified as in AddIntervalToFeature */
1384 	Int4 to , /* should lie within the range of the tRNA feature */
1385 	Boolean on_plus_strand );
1386 
1387 NLM_EXTERN Boolean MakeGeneFeature ( /* NOTE(review): undocumented here; which strings may be NULL must be confirmed */
1388 	NCBISubPtr submission,
1389 	SeqFeatPtr feature,
1390 	CharPtr gene_symbol_for_locus ,
1391 	CharPtr allele ,
1392 	CharPtr descriptive_name ,
1393 	CharPtr map_location ,
1394 	Boolean is_pseudo_gene ,
1395 	CharPtr genetic_database , /* presumably paired with gene_id_in_genetic_database as a database cross-reference -- confirm */
1396 	CharPtr gene_id_in_genetic_database ,
1397 	CharPtr synonym1 , /* the interface has room for at most three synonyms */
1398 	CharPtr synonym2 ,
1399 	CharPtr synonym3 );
1400 
1401 NLM_EXTERN Boolean MakeProteinFeature ( /* NOTE(review): undocumented here; which strings may be NULL must be confirmed */
1402 	NCBISubPtr submission,
1403 	SeqFeatPtr feature ,
1404 	CharPtr protein_name1, /* the interface has room for at most three names */
1405 	CharPtr protein_name2,
1406 	CharPtr protein_name3,
1407 	CharPtr descriptive_name,
1408 	CharPtr ECnum1, /* the interface has room for at most two EC numbers and two activities */
1409 	CharPtr ECnum2,
1410 	CharPtr activity1,
1411 	CharPtr activity2,
1412 	CharPtr protein_database, /* presumably paired with id_in_protein_database as a database cross-reference -- confirm */
1413 	CharPtr id_in_protein_database);
1414 
1415 NLM_EXTERN Boolean MakeRegionFeature ( /* adds type-specific info to a general feature; NOTE(review): details undocumented here */
1416 	NCBISubPtr submission,
1417 	SeqFeatPtr feature ,
1418 	CharPtr region_name );
1419 
1420 NLM_EXTERN Boolean MakeSiteFeature (
1421 	NCBISubPtr submission,
1422 	SeqFeatPtr feature ,
1423 	Int2 site_type ); /* NOTE(review): the numeric site codes are not listed in this header -- confirm */
1424 
1425 NLM_EXTERN Boolean MakeImpFeature (
1426 	NCBISubPtr submission,
1427 	SeqFeatPtr feature ,
1428 	CharPtr key ); /* NOTE(review): presumably the feature key text; allowed keys are not listed here -- confirm */
1429 
1430 NLM_EXTERN Boolean AddQualToImpFeature (
1431 	NCBISubPtr submission,
1432 	SeqFeatPtr imp_feature , /* presumably a feature already passed to MakeImpFeature -- confirm */
1433 	CharPtr qualifier ,
1434 	CharPtr value );
1435 
1436 NLM_EXTERN Boolean MakePubFeature (
1437 	NCBISubPtr submission,
1438 	SeqFeatPtr feature,
1439 	PubPtr pub ); /* presumably a PubPtr returned by one of the Cit...Build functions above -- confirm */
1440 
1441 NLM_EXTERN Boolean AddBasesToByteStore (ByteStorePtr bsp, CharPtr the_bases); /* NOTE(review): presumably the_bases uses the IUPAC nucleic acid codes listed at the end of this file -- confirm */
1442 
1443 NLM_EXTERN Boolean AddAAsToByteStore (ByteStorePtr bsp, CharPtr the_aas); /* NOTE(review): presumably the_aas uses the IUPAC amino acid codes listed at the end of this file -- confirm */
1444 
1445 /*****************************************************************************
1446 *
1447 *   AddPhrapGraph (submission, the_seq, local_name, phrap_values)
1448 *   	Converts phrap byte array to SeqGraph, wraps in SeqAnnot, adds to Bioseq.
1449 *       The length of data in the array must be equal to the length of the Bioseq.
1450 *
1451 *****************************************************************************/
1452 
1453 NLM_EXTERN Boolean AddPhrapGraph (
1454 	NCBISubPtr submission,
1455 	SeqEntryPtr the_seq ,
1456 	CharPtr local_name ,
1457 	BytePtr phrap_values ); /* byte array whose data length must equal the length of the Bioseq (see above) */
1458 
1459 NLM_EXTERN Boolean AddPhrapGraphToSeqLit (
1460 	NCBISubPtr submission,
1461 	SeqLitPtr slp ,
1462 	BytePtr phrap_values ); /* NOTE(review): presumably must match the length of the SeqLit, as AddPhrapGraph requires for a Bioseq -- confirm */
1463 
1464 /* internal functions for reference gene project */
1465 NLM_EXTERN UserObjectPtr CreateRefGeneTrackUserObject (void); /* presumably yields the uop taken by the Add...ToRefGeneTrackUserObject functions below -- confirm */
1466 NLM_EXTERN void AddStatusToRefGeneTrackUserObject (UserObjectPtr uop, CharPtr status);
1467 NLM_EXTERN void AddGeneratedToRefGeneTrackUserObject (UserObjectPtr uop, Boolean generated);
1468 NLM_EXTERN void AddCuratorToRefGeneTrackUserObject (UserObjectPtr uop, CharPtr collaborator); /* note: the argument is named "collaborator" although the function name says Curator */
1469 NLM_EXTERN void AddCuratorURLToRefGeneTrackUserObject (UserObjectPtr uop, CharPtr url);
1470 NLM_EXTERN void AddSourceToRefGeneTrackUserObject (UserObjectPtr uop, CharPtr genomicSource);
1471 NLM_EXTERN void AddAccessionToRefGeneTrackUserObject (UserObjectPtr uop, CharPtr field,
1472                                                       CharPtr accn, Int4 gi, Int4 from,
1473                                                       Int4 to, CharPtr comment); /* NOTE(review): allowed "field" names and the base of from/to are not stated here -- confirm */
1474 
1475 /* experimental function to associate mRNA with protein product in cases of alternative splicing */
1476 NLM_EXTERN UserObjectPtr CreateMrnaProteinLinkUserObject (BioseqPtr protbsp); /* protbsp: presumably the protein product Bioseq -- confirm */
1477 
1478 /* vector screen, validator count, general submission comment user object (JP) */
1479 NLM_EXTERN UserObjectPtr CreateSubmissionUserObject (CharPtr univecComment, /* presumably the "vector screen" text named in the comment above -- confirm */
1480                                                      CharPtr additionalComment, /* presumably the general submission comment -- confirm */
1481                                                      Int4 validatorErrorCount, /* presumably the "validator count" -- confirm */
1482                                                      Int4 validatorHashCode,
1483                                                      Boolean isCloningVector);
1484 
1485 /* clone name and ID for genomic contig RefSeq records */
1486 NLM_EXTERN UserObjectPtr CreateContigCloneUserObject (CharPtr name, Int4 ID); /* name and ID are the clone name and clone ID mentioned in the comment above */
1487 
1488 /* gene ontology process, component, and function user object */
1489 NLM_EXTERN UserObjectPtr CreateGeneOntologyUserObject ( /* presumably yields the uop taken by AddToGeneOntologyUserObject -- confirm */
1490   void
1491 );
1492 NLM_EXTERN void AddToGeneOntologyUserObject (
1493   UserObjectPtr uop,
1494   CharPtr type, /* presumably selects process, component or function; exact strings not stated here -- confirm */
1495   CharPtr text,
1496   CharPtr goid,
1497   Int4 pmid,
1498   CharPtr goref,
1499   CharPtr evidence
1500 );
1501 
1502 /* model evidence user object */
1503 NLM_EXTERN UserObjectPtr CreateModelEvidenceUserObject ( /* presumably yields the uop taken by the two functions below -- confirm */
1504   CharPtr method,
1505   CharPtr contigParent
1506 );
1507 NLM_EXTERN void AddMrnaOrESTtoModelEvidence (
1508   UserObjectPtr uop,
1509   CharPtr type, /* NOTE(review): presumably distinguishes mRNA from EST; exact strings not stated here -- confirm */
1510   CharPtr accn,
1511   Int4 length,
1512   Int4 gaplen
1513 );
1514 NLM_EXTERN UserFieldPtr FindModelEvidenceField (
1515   UserObjectPtr uop,
1516   CharPtr type /* NOTE(review): presumably the same "type" strings as AddMrnaOrESTtoModelEvidence -- confirm */
1517 );
1518 
1519 /* third party accession list user object manipulation */
1520 NLM_EXTERN UserObjectPtr CreateTpaAssemblyUserObject ( /* presumably yields the uop taken by AddAccessionToTpaAssemblyUserObject -- confirm */
1521   void
1522 );
1523 NLM_EXTERN UserFieldPtr CreateTPAAssemblyAccessionField (CharPtr accn);
1524 NLM_EXTERN UserFieldPtr CreateTPAAssemblyFromField (Int4 from);
1525 NLM_EXTERN UserFieldPtr CreateTPAAssemblyToField (Int4 to);
1526 
1527 NLM_EXTERN void AddAccessionToTpaAssemblyUserObject (
1528   UserObjectPtr uop,
1529   CharPtr accn,
1530   Int4 from, /* NOTE(review): whether from/to are 0- or 1-based is not stated here -- confirm */
1531   Int4 to
1532 );
1533 
1534 NLM_EXTERN UserObjectPtr CreateGenomeProjectsDBUserObject (
1535   void
1536 );
1537 NLM_EXTERN UserObjectPtr AddIDsToGenomeProjectsDBUserObject ( /* NOTE(review): returns a UserObjectPtr, unlike the void Add... functions nearby; confirm what is returned (e.g. when uop is NULL) */
1538   UserObjectPtr uop,
1539   Int4 projectID,
1540   Int4 parentID
1541 );
1542 
1543 /* annot desc comment policy user object */
1544 NLM_EXTERN UserObjectPtr CreateAnnotDescCommentPolicyUserObject (
1545   Boolean showInCommentBlock /* NOTE(review): effect of TRUE vs FALSE is not documented in this header -- confirm */
1546 );
1547 
1548 /* feature fetch policy user object */
1549 
1550 NLM_EXTERN UserObjectPtr CreateFeatureFetchPolicyUserObject (
1551   CharPtr policy /* NOTE(review): allowed policy strings are not listed in this header -- confirm */
1552 );
1553 
1554 /* structured comment user object for flatfile presentation */
1555 
1556 NLM_EXTERN UserObjectPtr CreateStructuredCommentUserObject ( /* presumably yields the uop taken by AddItemStructuredCommentUserObject -- confirm */
1557   CharPtr prefix,
1558   CharPtr suffix
1559 );
1560 
1561 NLM_EXTERN void AddItemStructuredCommentUserObject (
1562   UserObjectPtr uop,
1563   CharPtr field, /* NOTE(review): presumably the item label, with str as its value -- confirm */
1564   CharPtr str
1565 );
1566 
1567 /* DBLink user object for flatfile presentation */
1568 
1569 NLM_EXTERN UserObjectPtr CreateDBLinkUserObject ( /* presumably yields the uop taken by the Add...ToDBLinkUserObject functions -- confirm */
1570   void
1571 );
1572 
1573 NLM_EXTERN void AddIntListFieldToDBLinkUserObject (
1574   UserObjectPtr uop,
1575   Int4 num, /* NOTE(review): presumably the number of entries in values -- confirm */
1576   Int4Ptr values,
1577   CharPtr field_name
1578 );
1579 
1580 NLM_EXTERN void AddTraceAssemblyIDsToDBLinkUserObject ( /* same arguments as the function above minus field_name */
1581   UserObjectPtr uop,
1582   Int4 num,
1583   Int4Ptr values
1584 );
1585 
1586 NLM_EXTERN void AddStringListFieldToDBLinkUserObject (
1587   UserObjectPtr uop,
1588   Int4 num, /* NOTE(review): presumably the number of entries in values -- confirm */
1589   CharPtr PNTR values,
1590   CharPtr field_name
1591 );
1592 
1593 NLM_EXTERN void AddBioSampleIDsToDBLinkUserObject ( /* this and the functions below take the same arguments minus field_name */
1594   UserObjectPtr uop,
1595   Int4 num,
1596   CharPtr PNTR values
1597 );
1598 
1599 NLM_EXTERN void AddSeqReadArchIDsToDBLinkUserObject ( /* NOTE(review): near-duplicate of AddSeqReadArchiveIDsToDBLinkUserObject below; confirm how they differ */
1600   UserObjectPtr uop,
1601   Int4 num,
1602   CharPtr PNTR values
1603 );
1604 
1605 NLM_EXTERN void AddProbeDBIDsToDBLinkUserObject (
1606   UserObjectPtr uop,
1607   Int4 num,
1608   CharPtr PNTR values
1609 );
1610 
1611 NLM_EXTERN void AddSeqReadArchiveIDsToDBLinkUserObject ( /* NOTE(review): near-duplicate of AddSeqReadArchIDsToDBLinkUserObject above; confirm how they differ */
1612   UserObjectPtr uop,
1613   Int4 num,
1614   CharPtr PNTR values
1615 );
1616 
1617 NLM_EXTERN void AddBioProjectIDsToDBLinkUserObject (
1618   UserObjectPtr uop,
1619   Int4 num,
1620   CharPtr PNTR values
1621 );
1622 
1623 /* NcbiCleanup user object for SeriousSeqEntryCleanup time/version stamp */
1624 
1625 NLM_EXTERN UserObjectPtr CreateNcbiCleanupUserObject ( /* presumably yields the uop taken by the two Add... functions below -- confirm */
1626   void
1627 );
1628 
1629 NLM_EXTERN void AddStringToNcbiCleanupUserObject (
1630   UserObjectPtr uop,
1631   CharPtr field, /* NOTE(review): allowed field names are not listed in this header -- confirm */
1632   CharPtr str
1633 );
1634 
1635 NLM_EXTERN void AddIntegerToNcbiCleanupUserObject (
1636   UserObjectPtr uop,
1637   CharPtr field, /* NOTE(review): allowed field names are not listed in this header -- confirm */
1638   Int4 num
1639 );
1640 
1641 /* FindNcbiCleanupUserObject returns user object on top Seq-entry */
1642 
1643 NLM_EXTERN UserObjectPtr FindNcbiCleanupUserObject ( /* returns the user object on the top Seq-entry (see comment above) */
1644   SeqEntryPtr sep
1645 );
1646 
1647 NLM_EXTERN void RemoveAllNcbiCleanupUserObjects ( /* NOTE(review): how far below sep the removal reaches is not stated here -- confirm */
1648   SeqEntryPtr sep
1649 );
1650 
1651 /* Also can put NcbiCleanupUserObject on Seq-annot Annot-desc */
1652 
1653 NLM_EXTERN UserObjectPtr FindSeqAnnotCleanupUserObj ( /* Seq-annot counterpart of FindNcbiCleanupUserObject (see comment above) */
1654   SeqAnnotPtr sap
1655 );
1656 
1657 NLM_EXTERN void RemoveAllSeqAnnotCleanupUserObjs ( /* NOTE(review): whether chained Seq-annots are also processed is not stated here -- confirm */
1658   SeqAnnotPtr sap
1659 );
1660 
1661 NLM_EXTERN UserObjectPtr FindNcbiAutofixUserObject ( /* NOTE(review): the Autofix functions are undocumented in this header; confirm their meaning in the implementation */
1662   SeqEntryPtr sep
1663 );
1664 
1665 NLM_EXTERN void AddNcbiAutofixUserObject (
1666   SeqEntryPtr sep
1667 );
1668 
1669 NLM_EXTERN void RemoveNcbiAutofixUserObjects (
1670   SeqEntryPtr sep
1671 );
1672 
1673 /* Mark unverified sequences */
1674 
1675 NLM_EXTERN UserObjectPtr CreateUnverifiedUserObject ( /* user object used to mark unverified sequences (see comment above) */
1676   void
1677 );
1678 
1679 NLM_EXTERN UserObjectPtr FindUnverifiedUserObject (
1680   SeqEntryPtr sep
1681 );
1682 
1683 NLM_EXTERN UserObjectPtr AddUnverifiedUserObject ( /* NOTE(review): behavior when sep is already marked is not stated here -- confirm */
1684   SeqEntryPtr sep
1685 );
1686 
1687 NLM_EXTERN UserObjectPtr AddUnverifiedUserObjectToBioseq (
1688   BioseqPtr bsp
1689 );
1690 
1691 NLM_EXTERN UserObjectPtr AddUnverifiedUserObjectToBioseqParent ( /* NOTE(review): what counts as the "parent" of bsp is not stated here -- confirm */
1692   BioseqPtr bsp
1693 );
1694 
1695 NLM_EXTERN void AddStringToUnverifiedUserObject (
1696   UserObjectPtr uop,
1697   CharPtr field, /* NOTE(review): allowed field names are not listed in this header -- confirm */
1698   CharPtr str
1699 );
1700 
1701 NLM_EXTERN void RemoveUnverifiedUserObjects (
1702   SeqEntryPtr sep
1703 );
1704 
1705 NLM_EXTERN Boolean IsUnverifiedUserObject ( /* NOTE(review): presumably reports whether uop is an Unverified user object -- confirm */
1706   UserObjectPtr uop
1707 );
1708 
1709 
1710 #ifdef __cplusplus
1711 }
1712 #endif
1713 
1714 #undef NLM_EXTERN /* drop the definition used by the declarations above, then redefine it */
1715 #ifdef NLM_EXPORT
1716 #define NLM_EXTERN NLM_EXPORT
1717 #else /* NLM_EXPORT not defined: NLM_EXTERN expands to nothing */
1718 #define NLM_EXTERN
1719 #endif
1720 
1721 #endif
1722 
1723 
1724 /*****************************************************************************
1725 *
1726 *   Allowed IUPAC nucleic acid codes from /ncbi/data/seqcode.prt
1727 *
1728                 ( symbol "A", name "Adenine" ),
1729                 ( symbol "B" , name "G or T or C" ),
1730                 ( symbol "C", name "Cytosine" ),
1731                 ( symbol "D", name "G or A or T" ),
1732                 ( symbol "G", name "Guanine" ),
1733                 ( symbol "H", name "A or C or T" ) ,
1734                 ( symbol "K", name "G or T" ),
1735                 ( symbol "M", name "A or C" ),
1736                 ( symbol "N", name "A or G or C or T" ) ,
1737                 ( symbol "R", name "G or A"),
1738                 ( symbol "S", name "G or C"),
1739                 ( symbol "T", name "Thymine"),
1740                 ( symbol "V", name "G or C or A"),
1741                 ( symbol "W", name "A or T" ),
1742                 ( symbol "Y", name "T or C")
1743 *
1744 *
1745 *****************************************************************************/
1746 
1747 /*****************************************************************************
1748 *
1749 *   Allowed IUPAC amino acid codes from /ncbi/data/seqcode.prt
1750 
1751                 ( symbol "A", name "Alanine" ),
1752                 ( symbol "B" , name "Asp or Asn" ),
1753                 ( symbol "C", name "Cysteine" ),
1754                 ( symbol "D", name "Aspartic Acid" ),
1755                 ( symbol "E", name "Glutamic Acid" ),
1756                 ( symbol "F", name "Phenylalanine" ),
1757                 ( symbol "G", name "Glycine" ),
1758                 ( symbol "H", name "Histidine" ) ,
1759                 ( symbol "I", name "Isoleucine" ),
1760                 ( symbol "J", name "Leu or Ile" ),
1761                 ( symbol "K", name "Lysine" ),
1762                 ( symbol "L", name "Leucine" ),
1763                 ( symbol "M", name "Methionine" ),
1764                 ( symbol "N", name "Asparagine" ) ,
1765                 ( symbol "O", name "Pyrrolysine" ),
1766                 ( symbol "P", name "Proline" ),
1767                 ( symbol "Q", name "Glutamine"),
1768                 ( symbol "R", name "Arginine"),
1769                 ( symbol "S", name "Serine"),
1770                 ( symbol "T", name "Threonine"),
1771                 ( symbol "U", name "Selenocysteine"),
1772                 ( symbol "V", name "Valine"),
1773                 ( symbol "W", name "Tryptophan" ),
1774                 ( symbol "X", name "Undetermined or atypical"),
1775                 ( symbol "Y", name "Tyrosine"),
1776                 ( symbol "Z", name "Glu or Gln" )
1777 *
1778 *
1779 *****************************************************************************/
1780 
1781 /*****************************************************************************
1782 *
1783 *   Genetic Code id's and names from /ncbi/data/gc.prt
1784 *      gc.prt lists the legal start codons and genetic codes fully
1785 *
1786                 name "Standard" ,
1787                 id 1 ,
1788 
1789                 name "Vertebrate Mitochondrial" ,
1790                 id 2 ,
1791 
1792                 name "Yeast Mitochondrial" ,
1793                 id 3 ,
1794 
1795                 name "Mold Mitochondrial and Mycoplasma" ,
1796                 id 4 ,
1797 
1798                 name "Invertebrate Mitochondrial" ,
1799                 id 5 ,
1800 
1801                 name "Ciliate Macronuclear and Dasycladacean" ,
1802                 id 6 ,
1803 
1804                 name "Echinoderm Mitochondrial" ,
1805                 id 9 ,
1806 
1807                 name "Euplotid Macronuclear" ,
1808                 id 10 ,
1809 
1810                 name "Bacterial and Plant Plastid" ,
1811                 id 11 ,
1812 
1813                 name "Alternative Yeast Nuclear" ,
1814                 id 12 ,
1815 
1816                 name "Ascidian Mitochondrial" ,
1817                 id 13 ,
1818 
1819                 name "Alternative Flatworm Mitochondrial" ,
1820                 id 14 ,
1821 
1822                 name "Blepharisma Macronuclear" ,
1823                 id 15 ,
1824 
1825                 name "Chlorophycean Mitochondrial" ,
1826                 id 16 ,
1827 
1828                 name "Trematode Mitochondrial" ,
1829                 id 21 ,
1830 
1831                 name "Scenedesmus obliquus Mitochondrial" ,
1832                 id 22 ,
1833 
1834                 name "Thraustochytrium Mitochondrial" ,
1835                 id 23 ,
1836 
1837 *
1838 *
1839 *****************************************************************************/
1840 
1841 
1842 
1843 
1844