/*   subutil.h
* >> Set tabs to 4 spaces for a nice printout
*
* ===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*            National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act. It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted. This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
25 * 26 * =========================================================================== 27 * 28 * File Name: subutil.h 29 * 30 * Author: James Ostell 31 * 32 * Version Creation Date: 11/3/93 33 * 34 * $Revision: 6.93 $ 35 * 36 * File Description: Utilities for creating ASN.1 submissions 37 * 38 * Modifications: 39 * -------------------------------------------------------------------------- 40 * Date Name Description of modification 41 * ------- ---------- ----------------------------------------------------- 42 * 43 * ========================================================================== 44 */ 45 46 #ifndef _NCBI_SubUtil_ 47 #define _NCBI_SubUtil_ 48 49 #ifndef _NCBI_Submit_ 50 #include <objsub.h> 51 #endif 52 53 #undef NLM_EXTERN 54 #ifdef NLM_IMPORT 55 #define NLM_EXTERN NLM_IMPORT 56 #else 57 #define NLM_EXTERN extern 58 #endif 59 60 #ifdef __cplusplus 61 extern "C" { 62 #endif 63 64 65 /***************************************************************************** 66 * 67 * Create a GenBank direct submission 68 * This supports a basic set of datatypes for making a new direct 69 * submission to GenBank in ASN.1. It is designed for folks wanting to 70 * read their own data storage format, then make a valid direct submission 71 * without going through an intermediate tool. 72 * 73 * You may have many "entries" in a single submission. A single entry 74 * may contain: 75 * One protein sequence (called a "Bioseq") 76 * One nucleic acid sequence (called a "Bioseq") 77 * One nucleic acid sequence for which you only have a series of 78 * sequence pieces (e.g. you sequenced around the exons of a 79 * genomic sequence, but not the introns) (called a "segmented 80 * set") 81 * One nucleic acid sequence and the protein sequences it codes for. 
*           (nucleic acid may be a single Bioseq or a segmented set)
*           (this entry called a "nuc-prot set")
*
*     NCBI considers the protein sequences part
*     of the submission, and they are created as proteins in their own right
*     by the routines below. You can either supply the protein sequence from
*     your own software (best case), in which case we check that the coding
*     region you supply translates to it. If you do not supply a protein
*     sequence, then all we can do is check that it translates without stops.
*
*     NCBI also considers "gene" to refer to a region of nucleic acid
*     within which are found elements (such as promoters, coding regions, etc)
*     leading to a phenomenon recognized as a gene (note this also accommodates
*     anonymous markers as well as expressed products). This is in contrast to
*     some other notions that a gene is simply a qualifier on other features of
*     the DNA. A separate function to produce a gene feature is supplied. The
*     intervals given for it should include the intervals for the other
*     features it contains.
100 * 101 * The process of building the direct submission is roughly: 102 * 103 * Create the submission 104 * Add the submission citation 105 * Create an entry (can be 1 or more sequences) 106 * Add the organism information 107 * Add any publication citations 108 * Add the sequences 109 * Fill in the residues 110 * Add the features 111 * Validate the entry 112 * Write the entry 113 * Free the memory used 114 * 115 * Each element may have subfunctions: 116 * 117 * Create a citation 118 * Add author names 119 * Add author affiliation 120 * 121 * Create a sequence 122 * Add modifiers 123 * 124 * Create a feature 125 * Add information specific to type of feature 126 * Add intervals on the sequence 127 * 128 *****************************************************************************/ 129 typedef Boolean (* SubErrorFunc) (CharPtr msg); 130 131 typedef struct ncbisub { 132 SeqSubmitPtr ssp; /* the submission */ 133 SubErrorFunc err_func; /* the error handler */ 134 Int2 gap_count; /* for unique gap names in segs */ 135 CharPtr submittor_key; /* used for turning local SeqId to General */ 136 } NCBISub, PNTR NCBISubPtr; 137 138 #define PubPtr ValNodePtr /* should really be typedeffed */ 139 140 /***************************************************************************** 141 * 142 * Prototypes for building a direct submission 143 * 144 *****************************************************************************/ 145 146 /* default error handler */ 147 148 NLM_EXTERN Boolean DefaultSubErrorFunc (CharPtr msg); 149 150 /***************************************************************************** 151 * 152 * Create/Free the NCBISub 153 * 154 *****************************************************************************/ 155 156 NLM_EXTERN NCBISubPtr NCBISubCreate ( 157 CharPtr last_name, 158 CharPtr first_name, 159 CharPtr middle_name, 160 CharPtr initials, /* separated by periods, no initial for last name */ 161 CharPtr suffix, /* Jr. Sr. III */ 162 CharPtr affil, /* e.g. 
"Xyz University" */ 163 CharPtr div, /* e.g. "Dept of Biology" */ 164 CharPtr street, /* e.g. "123 Academic Road" */ 165 CharPtr city, /* e.g. "Metropolis" */ 166 CharPtr sub, /* e.g. "Massachusetts" */ 167 CharPtr country , /* e.g. "USA" */ 168 CharPtr postal_code, /* e.g."02133" */ 169 CharPtr phone , 170 CharPtr fax , 171 CharPtr email, 172 Boolean hold_until_publish , 173 Int2 release_month , 174 Int2 release_day , 175 Int2 release_year ); 176 177 NLM_EXTERN Boolean DefineSubmittorKey( 178 NCBISubPtr nsp, 179 CharPtr submittor_key ); /* submitting large scale lab, for regular submissions */ 180 181 /**** WARNING: NCBISubBuild() is the old style submission ***/ 182 /**** It has been replaced by NCBISubCreate() ***/ 183 /**** NCBISubBuild will be discontinued ***/ 184 185 NLM_EXTERN NCBISubPtr NCBISubBuild ( 186 CharPtr name, 187 CharPtr PNTR address , 188 CharPtr phone , 189 CharPtr fax , 190 CharPtr email, 191 Boolean hold_until_publish , 192 Int2 release_month, 193 Int2 release_day, 194 Int2 release_year ); 195 196 197 198 /** every submission must have 1 submission citation **/ 199 /** see below to add authors and affiliation **********/ 200 201 NLM_EXTERN Boolean CitSubForSubmission ( 202 NCBISubPtr submission, 203 PubPtr cit_sub ); 204 205 NLM_EXTERN Boolean AddToolToSub ( 206 NCBISubPtr nsp, 207 CharPtr tool ); 208 209 NLM_EXTERN Boolean AddCommentToSub ( 210 NCBISubPtr nsp, 211 CharPtr comment ); 212 213 NLM_EXTERN Boolean AddTypeToSub ( 214 NCBISubPtr nsp, 215 Uint1 type ); 216 217 NLM_EXTERN Boolean NCBISubWrite ( 218 NCBISubPtr ssp, 219 CharPtr filename ); 220 221 NLM_EXTERN NCBISubPtr NCBISubFree ( 222 NCBISubPtr ssp ); 223 224 /***************************************************************************** 225 * 226 * You can (should) run the ncbi validator routines on your final submission. 227 * It returns a count of all errors or questions found. 228 * 229 * The errfile parameter is no longer supported. 
Errors are directed 230 * based on the ErrLog functions in the toolkit. If you have done none 231 * of this, then errors will appear on stderr. 232 * 233 *****************************************************************************/ 234 235 NLM_EXTERN Int2 NCBISubValidate (NCBISubPtr nsp, FILE * errfile); 236 237 /***************************************************************************** 238 * 239 * Add Entries to the Submission 240 * Add Sequences to the Entries 241 * 242 *****************************************************************************/ 243 244 /***************************************************************************** 245 * 246 * About Sequence Identifiers: 247 * 248 * Note that in all functions below where you create a Bioseq in your entry, 249 * you can supply a number of different pieces of information to make a 250 * sequence id. 251 * 252 * local_name: This is a string for whatever name you call this sequence 253 * locally. Could be a clone name or whatever. There are no 254 * limits on this other than it should be unique in the 255 * submission. It is REQUIRED. 256 * 257 * SeqEntryPtr: Returned by the function, this is a pointer to the Bioseq. 258 * 259 * In later functions, such as adding feature locations, you can refer to 260 * the Bioseq you created either with the local_name or directly with the 261 * SeqEntryPtr. Whatever is more convenient for you is fine. 262 * 263 * The other ids only apply to updates. These allow you to update your 264 * entry in GenBank simply by sending a new entry with the same accession 265 * number you were issued on the last one. In this case you should also 266 * be sure to add the create_date, which will be returned to you in the 267 * ASN.1 of your direct submission after processing. This is not absolutely 268 * required, but does let us check that it is the right entry (errors 269 * could occur when you enter your old accession number). 270 * 271 * genbank_locus: OPTIONAL on update. 
The name appearing on the LOCUS line. 272 * genbank_accession: REQUIRED on update. 273 * gi_number: OPTIONAL on update for now. The unique ID assigned by NCBI 274 * to a particular sequence (DNA or protein) in your entry. 275 * 276 * If you update your entry, whether you change the sequence or not, the 277 * accession number and locus will remain the same, so people can retrieve 278 * your new data with the old id. However, the gi_number is explicitly keyed 279 * to the sequence, and will change if there are any changes/additions to 280 * the sequence. In addition, a history will be created indicating the old 281 * gi_number and the date the new entry replaced it. Both old and new 282 * entries will be available from NCBI for retrieval on gi_number. Only the 283 * new entry will appear in the next GenBank or Entrez release. 284 * 285 *****************************************************************************/ 286 287 288 /*** Entry contains only 1 raw Bioseq ***/ 289 290 NLM_EXTERN SeqEntryPtr AddSeqOnlyToSubmission ( 291 NCBISubPtr submission , 292 CharPtr local_name , 293 CharPtr genbank_locus , 294 CharPtr genbank_accession , 295 Int4 gi_number , 296 Int2 molecule_class, 297 Int2 molecule_type , 298 Int4 length , 299 Int2 topology , 300 Int2 strandedness ); 301 302 /*** Entry contains a segmented set of Bioseqs ***/ 303 304 NLM_EXTERN SeqEntryPtr AddSegmentedSeqToSubmission ( 305 NCBISubPtr submission , 306 CharPtr local_name , 307 CharPtr genbank_locus , 308 CharPtr genbank_accession , 309 Int4 gi_number , 310 Int2 molecule_class, 311 Int2 molecule_type , 312 Int4 length , 313 Int2 topology , 314 Int2 strandedness ); 315 316 NLM_EXTERN SeqEntryPtr AddSeqToSegmentedEntry ( 317 NCBISubPtr submission, 318 SeqEntryPtr segmented_seq_entry, 319 CharPtr local_name , 320 CharPtr genbank_locus , 321 CharPtr genbank_accession , 322 Int4 gi_number , 323 Int2 molecule_class, 324 Int2 molecule_type , 325 Int4 length , 326 Int2 topology , 327 Int2 strandedness ); 328 329 
NLM_EXTERN Boolean AddGapToSegmentedEntry ( 330 NCBISubPtr submission, 331 SeqEntryPtr segmented_seq_entry, 332 Int4 length_of_gap ); /** 0 if not known */ 333 334 NLM_EXTERN Boolean AddReferenceToSegmentedEntry ( 335 NCBISubPtr submission , 336 SeqEntryPtr segmented_seq_entry, 337 CharPtr genbank_accession , 338 Int4 gi_number , 339 Int4 from , 340 Int4 to , 341 Boolean on_plus_strand ); 342 343 /*** Entry contains sets of similar sequences ***/ 344 345 NLM_EXTERN SeqEntryPtr AddPopSetToSubmission ( 346 NCBISubPtr submission ); 347 348 NLM_EXTERN SeqEntryPtr AddPhySetToSubmission ( 349 NCBISubPtr submission ); 350 351 NLM_EXTERN SeqEntryPtr AddMutSetToSubmission ( 352 NCBISubPtr submission ); 353 354 NLM_EXTERN SeqEntryPtr AddGenBankSetToSubmission ( 355 NCBISubPtr submission ); 356 357 /*** Entry contains nucleotide and translated proteins ***/ 358 359 NLM_EXTERN SeqEntryPtr AddNucProtToSubmission ( 360 NCBISubPtr submission ); 361 362 NLM_EXTERN SeqEntryPtr AddSeqToNucProtEntry ( /** add unsegmented nuc or prot bioseq */ 363 NCBISubPtr submission, 364 SeqEntryPtr nuc_prot_entry, 365 CharPtr local_name , 366 CharPtr genbank_locus , 367 CharPtr genbank_accession , 368 Int4 gi_number , 369 Int2 molecule_class, 370 Int2 molecule_type , 371 Int4 length , 372 Int2 topology , 373 Int2 strandedness ); 374 /** add segmented nuc or prot bioseq set */ 375 376 NLM_EXTERN SeqEntryPtr AddSegmentedSeqToNucProtEntry ( 377 NCBISubPtr submission, 378 SeqEntryPtr nuc_prot_entry , 379 CharPtr local_name , 380 CharPtr genbank_locus , 381 CharPtr genbank_accession , 382 Int4 gi_number , 383 Int2 molecule_class, 384 Int2 molecule_type , 385 Int4 length , 386 Int2 topology , 387 Int2 strandedness ); 388 389 NLM_EXTERN SeqEntryPtr AddDeltaSeqToNucProtEntry ( 390 NCBISubPtr submission, 391 SeqEntryPtr nuc_prot_entry , 392 CharPtr local_name , 393 CharPtr genbank_locus , 394 CharPtr genbank_accession , 395 Int4 gi_number , 396 Int2 molecule_class, 397 Int2 molecule_type , 398 Int4 length 
, 399 Int2 topology , 400 Int2 strandedness ); 401 402 /**** Entry contains one delta sequence ****/ 403 404 NLM_EXTERN SeqEntryPtr AddDeltaSeqOnlyToSubmission ( 405 NCBISubPtr submission, 406 CharPtr local_name , 407 CharPtr genbank_locus , 408 CharPtr genbank_accession , 409 Int4 gi_number , 410 Int2 molecule_class, 411 Int2 molecule_type , 412 Int4 length , 413 Int2 topology , 414 Int2 strandedness ); 415 416 NLM_EXTERN Boolean AddGapToDeltaSeq ( 417 NCBISubPtr submission, 418 SeqEntryPtr delta_seq_entry, 419 Int4 length_of_gap ); /** 0 if not known */ 420 421 NLM_EXTERN SeqLitPtr AddFakeGapToDeltaSeq ( 422 NCBISubPtr submission, 423 SeqEntryPtr delta_seq_entry, 424 Int4 length_of_gap ); /** returns slp so program can set lim - unk fuzz after empty gaps are spread */ 425 426 NLM_EXTERN SeqLitPtr AddLiteralToDeltaSeq ( 427 NCBISubPtr submission, 428 SeqEntryPtr delta_seq_entry, 429 Int4 length_of_sequence ); 430 431 432 #define MOLECULE_CLASS_DNA 1 433 #define MOLECULE_CLASS_RNA 2 434 #define MOLECULE_CLASS_NUC 4 435 #define MOLECULE_CLASS_PROTEIN 3 436 437 #define MOLECULE_TYPE_GENOMIC 1 438 #define MOLECULE_TYPE_PRE_MRNA 2 439 #define MOLECULE_TYPE_MRNA 3 440 #define MOLECULE_TYPE_RRNA 4 441 #define MOLECULE_TYPE_TRNA 5 442 #define MOLECULE_TYPE_SNRNA 6 443 #define MOLECULE_TYPE_SCRNA 7 444 #define MOLECULE_TYPE_PEPTIDE 8 445 #define MOLECULE_TYPE_OTHER_GENETIC_MATERIAL 9 446 #define MOLECULE_TYPE_GENOMIC_MRNA_MIX 10 447 #define MOLECULE_TYPE_CRNA 11 448 #define MOLECULE_TYPE_SNORNA 12 449 #define MOLECULE_TYPE_TRANSCRIBED_RNA 13 450 #define MOLECULE_TYPE_NCRNA 14 451 #define MOLECULE_TYPE_TMRNA 15 452 453 #define TOPOLOGY_LINEAR 1 454 #define TOPOLOGY_CIRCULAR 2 455 #define TOPOLOGY_TANDEM 3 456 457 #define STRANDEDNESS_SINGLE 1 458 #define STRANDEDNESS_DOUBLE 2 459 460 /****************************************************************** 461 * 462 * Fill in Bases or Amino Acids 463 * 1) You may call functions as often per bioseq as you like 464 * up to the 
length of the Bioseq 465 * 2) All codes are iupac and defined in /ncbi/data/seqcode.prt 466 * as an ASN.1 file used by this code. Excerpts at the 467 * end of this file. Even though it's ASN.1 you will find 468 * you can read it with no trouble. 469 * 3) IUPAC codes are UPPER CASE. These functions will upper 470 * case for you. 471 * 4) In nucleic acids 'U' will be changed to 'T' 472 * 5) In both cases, non-letters will be stripped from the 473 * the input strings to facilate input from external 474 * formatted files with numbers and internal spaces and 475 * such. 476 * 477 ******************************************************************/ 478 479 NLM_EXTERN Boolean AddBasesToBioseq ( 480 NCBISubPtr submission , 481 SeqEntryPtr the_seq , 482 CharPtr the_bases ); 483 484 NLM_EXTERN Boolean AddAminoAcidsToBioseq ( 485 NCBISubPtr submission , 486 SeqEntryPtr the_seq , 487 CharPtr the_aas ); 488 489 /** variant functions for Delta sequences ***/ 490 491 NLM_EXTERN Boolean AddBasesToLiteral ( 492 NCBISubPtr submission , 493 SeqLitPtr the_literal , 494 CharPtr the_bases ); 495 496 NLM_EXTERN Boolean AddAminoAcidsToLiteral ( 497 NCBISubPtr submission , 498 SeqLitPtr the_literal , 499 CharPtr the_aas ); 500 501 502 /***************************************************************************** 503 * 504 * Add Annotations to Entries 505 * 506 *****************************************************************************/ 507 508 NLM_EXTERN Boolean AddTitleToEntry ( 509 NCBISubPtr submission, 510 SeqEntryPtr entry , 511 CharPtr title ); 512 513 NLM_EXTERN Boolean AddSecondaryAccnToEntry ( 514 NCBISubPtr submission, 515 SeqEntryPtr entry , 516 CharPtr accn ); 517 518 /***************************************************************** 519 * 520 * rules for long comments 521 * 1) include no non-ascii characters (e.g. 
\t \r \n) 522 * 2) you may force a line feed on display by using tilde '~' 523 * 3) you format a table by adding leading spaces after a '~' 524 * 4) non-ascii chars will be converted on input (also for 525 * title) \n='~', all others='#' 526 * 527 *****************************************************************/ 528 529 NLM_EXTERN Boolean AddCommentToEntry ( 530 NCBISubPtr submission, 531 SeqEntryPtr entry , 532 CharPtr comment ); 533 534 NLM_EXTERN Boolean AddOrganismToEntryNew ( 535 NCBISubPtr submission, 536 SeqEntryPtr entry , 537 CharPtr scientific_name , 538 CharPtr common_name , 539 CharPtr virus_name , 540 CharPtr strain , 541 CharPtr synonym1, 542 CharPtr synonym2, 543 CharPtr synonym3, 544 CharPtr taxonomy ); 545 546 /** AddOrganismToEntryNew() defaults to universal code (0) 547 ** for both cytoplasmic and mitochondiral ribosomes. You 548 ** also supply the code when you create a CdRegion. If the 549 ** CdRegion code does not match the organism code, the 550 ** validator will warn, but will translate by CdRegion code. 551 ** if you need an alternate code, SetGeneticCodeForEntry() 552 ** can be used to eliminate the conflict. See table of genetic 553 ** codes at end of this file. You should call 554 ** AddOrganismToEntryNew() before calling 555 ** SetGeneticCodeForEntry() **/ 556 557 NLM_EXTERN Boolean AddOrganismToEntryEx ( 558 NCBISubPtr submission, 559 SeqEntryPtr entry , 560 CharPtr scientific_name , 561 CharPtr common_name , 562 CharPtr virus_name , 563 CharPtr strain , 564 CharPtr synonym1, 565 CharPtr synonym2, 566 CharPtr synonym3, 567 CharPtr taxonomy, 568 Int4 taxid ); 569 570 /** AddOrganismToEntryEx() allows taxonID to be entered **/ 571 572 NLM_EXTERN Boolean SetGeneticCodeForEntry ( 573 NCBISubPtr submission, 574 SeqEntryPtr entry, 575 Uint1 genetic_code, /* for cytoplasm */ 576 Uint1 mito_code ); /* for mitochondria */ 577 578 579 580 /************************************************** 581 * OBSOLETE!!! do not use. 
Use AddOrganismToEntryNew 582 * 583 **************************************************/ 584 NLM_EXTERN Boolean AddOrganismToEntry ( 585 NCBISubPtr submission, 586 SeqEntryPtr entry , 587 CharPtr scientific_name , 588 CharPtr common_name , 589 CharPtr virus_name , 590 CharPtr strain , 591 CharPtr synonym1, 592 CharPtr synonym2, 593 CharPtr synonym3); 594 595 NLM_EXTERN Boolean AddGenBankBlockToEntry ( 596 NCBISubPtr submission, 597 SeqEntryPtr entry , 598 CharPtr taxonomy , 599 CharPtr division , 600 CharPtr keyword1 , 601 CharPtr keyword2 , 602 CharPtr keyword3 ); 603 604 #define GENOME_unknown 0 605 #define GENOME_genomic 1 606 #define GENOME_chloroplast 2 607 #define GENOME_chromoplast 3 608 #define GENOME_kinetoplast 4 609 #define GENOME_mitochondrion 5 610 #define GENOME_plastid 6 611 #define GENOME_macronuclear 7 612 #define GENOME_extrachrom 8 613 #define GENOME_plasmid 9 614 #define GENOME_transposon 10 615 #define GENOME_insertion_seq 11 616 #define GENOME_cyanelle 12 617 #define GENOME_proviral 13 618 #define GENOME_virion 14 619 #define GENOME_nucleomorph 15 620 #define GENOME_apicoplast 16 621 #define GENOME_leucoplast 17 622 #define GENOME_proplastid 18 623 #define GENOME_endogenous_virus 19 624 #define GENOME_hydrogenosome 20 625 #define GENOME_chromosome 21 626 #define GENOME_chromatophore 22 627 #define GENOME_plasmid_in_mitochondrion 23 628 #define GENOME_plasmid_in_plastid 24 629 630 /******************************************** 631 * Genome describes the type of genome from which the DNA or gene for 632 * a protein is located. Values are: 633 genome INTEGER { -- biological context 634 unknown (0) , 635 genomic (1) , 636 chloroplast (2) , 637 chromoplast (3) , 638 kinetoplast (4) , 639 mitochondrion (5) , 640 plastid (6) , 641 macronuclear (7) , 642 extrachrom (8) , 643 plasmid (9) , 644 transposon (10) , 645 insertion-seq (11) , 646 cyanelle (12) , 647 proviral (13) , 648 virion (14) } DEFAULT unknown , 649 more types added, see GENOME_.. 
above 650 **********************************************/ 651 652 NLM_EXTERN Boolean AddGenomeToEntry ( 653 NCBISubPtr submission, 654 SeqEntryPtr entry , 655 Int2 type ); 656 657 #define SUBSRC_chromosome 1 658 #define SUBSRC_map 2 659 #define SUBSRC_clone 3 660 #define SUBSRC_subclone 4 661 #define SUBSRC_haplotype 5 662 #define SUBSRC_genotype 6 663 #define SUBSRC_sex 7 664 #define SUBSRC_cell_line 8 665 #define SUBSRC_cell_type 9 666 #define SUBSRC_tissue_type 10 667 #define SUBSRC_clone_lib 11 668 #define SUBSRC_dev_stage 12 669 #define SUBSRC_frequency 13 670 #define SUBSRC_germline 14 671 #define SUBSRC_rearranged 15 672 #define SUBSRC_lab_host 16 673 #define SUBSRC_pop_variant 17 674 #define SUBSRC_tissue_lib 18 675 #define SUBSRC_plasmid_name 19 676 #define SUBSRC_transposon_name 20 677 #define SUBSRC_insertion_seq_name 21 678 #define SUBSRC_plastid_name 22 679 #define SUBSRC_country 23 680 #define SUBSRC_segment 24 681 #define SUBSRC_endogenous_virus_name 25 682 #define SUBSRC_transgenic 26 683 #define SUBSRC_environmental_sample 27 684 #define SUBSRC_isolation_source 28 685 #define SUBSRC_lat_lon 29 686 #define SUBSRC_collection_date 30 687 #define SUBSRC_collected_by 31 688 #define SUBSRC_identified_by 32 689 #define SUBSRC_fwd_primer_seq 33 690 #define SUBSRC_rev_primer_seq 34 691 #define SUBSRC_fwd_primer_name 35 692 #define SUBSRC_rev_primer_name 36 693 #define SUBSRC_metagenomic 37 694 #define SUBSRC_mating_type 38 695 #define SUBSRC_linkage_group 39 696 #define SUBSRC_haplogroup 40 697 #define SUBSRC_whole_replicon 41 698 #define SUBSRC_phenotype 42 699 #define SUBSRC_altitude 43 700 #define SUBSRC_other 255 701 702 /********************************************* 703 * SubSource defines subclasses of source material 704 * (also see OrgMod below for subclasses of organism names) 705 * 706 * allowed values for type are: 707 chromosome (1) , 708 map (2) , 709 clone (3) , 710 subclone (4) , 711 haplotype (5) , 712 genotype (6) , 713 sex (7) , 714 
cell-line (8) , 715 cell-type (9) , 716 tissue-type (10) , 717 clone-lib (11) , 718 dev-stage (12) , 719 frequency (13) , 720 germline (14) , 721 rearranged (15) , 722 lab-host (16) , 723 pop-variant (17) , 724 tissue-lib (18) , 725 plasmid-name (19) , 726 transposon-name (20) , 727 insertion-seq-name (21) , 728 plastid-name (22) , 729 country (23) , 730 segment (24) , 731 endogenous-virus-name (25) , 732 transgenic (26) , 733 environmental-sample (27) , 734 isolation-source (28) , 735 lat-lon (29) , -- +/- decimal degrees 736 collection-date (30) , -- DD-MMM-YYYY format 737 collected-by (31) , -- name of person who collected the sample 738 identified-by (32) , -- name of person who identified the sample 739 fwd-primer-seq (33) , -- sequence (possibly more than one; semicolon-separated) 740 rev-primer-seq (34) , -- sequence (possibly more than one; semicolon-separated) 741 fwd-primer-name (35) , 742 rev-primer-name (36) , 743 metagenomic (37) , 744 mating-type (38) , 745 linkage-group (39) , 746 haplogroup (40) , 747 whole-replicon (41) , 748 phenotype (42) , 749 altitude (43) , 750 other (255) } , 751 752 * value is an optional string to give the name (eg. 
of the 753 * clone) 754 ******************************************/ 755 NLM_EXTERN Boolean AddSubSourceToEntry ( 756 NCBISubPtr submission, 757 SeqEntryPtr entry , 758 Int2 type , 759 CharPtr value); 760 761 #define ORGMOD_strain 2 762 #define ORGMOD_substrain 3 763 #define ORGMOD_type 4 764 #define ORGMOD_subtype 5 765 #define ORGMOD_variety 6 766 #define ORGMOD_serotype 7 767 #define ORGMOD_serogroup 8 768 #define ORGMOD_serovar 9 769 #define ORGMOD_cultivar 10 770 #define ORGMOD_pathovar 11 771 #define ORGMOD_chemovar 12 772 #define ORGMOD_biovar 13 773 #define ORGMOD_biotype 14 774 #define ORGMOD_group 15 775 #define ORGMOD_subgroup 16 776 #define ORGMOD_isolate 17 777 #define ORGMOD_common 18 778 #define ORGMOD_acronym 19 779 #define ORGMOD_dosage 20 780 #define ORGMOD_nat_host 21 781 #define ORGMOD_sub_species 22 782 #define ORGMOD_specimen_voucher 23 783 #define ORGMOD_authority 24 784 #define ORGMOD_forma 25 785 #define ORGMOD_forma_specialis 26 786 #define ORGMOD_ecotype 27 787 #define ORGMOD_synonym 28 788 #define ORGMOD_anamorph 29 789 #define ORGMOD_teleomorph 30 790 #define ORGMOD_breed 31 791 #define ORGMOD_gb_acronym 32 792 #define ORGMOD_gb_anamorph 33 793 #define ORGMOD_gb_synonym 34 794 #define ORGMOD_culture_collection 35 795 #define ORGMOD_bio_material 36 796 #define ORGMOD_metagenome_source 37 797 #define ORGMOD_type_material 38 798 #define ORGMOD_old_lineage 253 799 #define ORGMOD_old_name 254 800 #define ORGMOD_other 255 801 802 /* Defines for BioSrc.origin 803 */ 804 #define ORG_UNKNOWN 0 805 #define ORG_NATURAL 1 806 #define ORG_NATMUT 2 807 #define ORG_MUT 3 808 #define ORG_ARTIFICIAL 4 809 #define ORG_SYNTHETIC 5 810 #define ORG_OTHER 255 811 #define ORG_DEFAULT ORG_UNKNOWN 812 813 #define IS_ORG_UNKNOWN(S) ((S).origin == ORG_UNKNOWN) 814 #define IS_ORG_NATURAL(S) ((S).origin == ORG_NATURAL) 815 #define IS_ORG_NATMUT(S) ((S).origin == ORG_NATMUT) 816 #define IS_ORG_MUT(S) ((S).origin == ORG_MUT) 817 #define IS_ORG_ARTIFICIAL(S) 
((S).origin == ORG_ARTIFICIAL) 818 #define IS_ORG_SYNTHETIC(S) ((S).origin == ORG_SYNTHETIC) 819 #define IS_ORG_OTHER(S) ((S).origin == ORG_OTHER) 820 821 822 /********************************************* 823 * OrgMod defines subclasses of organism names 824 * (also see SubSource above for subclasses of source material) 825 * 826 * allowed values for type are: 827 strain (2) , 828 substrain (3) , 829 type (4) , 830 subtype (5) , 831 variety (6) , 832 serotype (7) , 833 serogroup (8) , 834 serovar (9) , 835 cultivar (10) , 836 pathovar (11) , 837 chemovar (12) , 838 biovar (13) , 839 biotype (14) , 840 group (15) , 841 subgroup (16) , 842 isolate (17) , 843 common (18) , 844 acronym (19) , 845 dosage (20) , -- chromosome dosage of hybrid 846 nat-host (21) , -- natural host of this specimen 847 sub-species (22) , 848 specimen-voucher (23) , 849 authority (24) , 850 forma (25) , 851 forma-specialis (26) , 852 ecotype (27) , 853 synonym (28) , 854 anamorph (29) , 855 teleomorph (30) , 856 breed (31) , 857 gb-acronym (32) , -- used by taxonomy database 858 gb-anamorph (33) , -- used by taxonomy database 859 gb-synonym (34) , -- used by taxonomy database 860 culture-collection (35) , 861 bio-material (36) , 862 metagenome-source (37) , 863 old-lineage (253) , 864 old-name (254) , 865 other (255) } , -- ASN5: old-name (254) will be added to next spec 866 867 * value is an optional string to give the name (eg. 
of the 868 * varient) 869 ******************************************/ 870 NLM_EXTERN Boolean AddOrgModToEntry ( 871 NCBISubPtr submission, 872 SeqEntryPtr entry , 873 Int2 type , 874 CharPtr value); 875 876 /******************************************** 877 * Biomol describes the biological type of the molecule 878 * current values are: 879 biomol INTEGER { 880 unknown (0) , 881 genomic (1) , 882 pre-RNA (2) , -- precursor RNA of any sort really 883 mRNA (3) , 884 rRNA (4) , 885 tRNA (5) , 886 snRNA (6) , 887 scRNA (7) , 888 peptide (8) , 889 other-genetic (9) , -- other genetic material 890 genomic-mRNA (10) , -- reported a mix of genomic and cdna sequence 891 other (255) } DEFAULT unknown , 892 ********************************************/ 893 NLM_EXTERN Boolean AddBiomolToEntry ( 894 NCBISubPtr submission, 895 SeqEntryPtr entry , 896 Int2 type ); 897 898 /******************************************** 899 * 900 * What technique was used to get this sequence ? 901 * There are a set of defines in objpubd.h for this: 902 * Current list is: 903 #define MI_TECH_unknown 0 904 #define MI_TECH_standard 1 905 #define MI_TECH_est 2 EST division 906 #define MI_TECH_sts 3 STS division 907 #define MI_TECH_survey 4 GSS division 908 #define MI_TECH_genemap 5 Bioseq is a genetic map 909 #define MI_TECH_physmap 6 Bioseq is physical map 910 #define MI_TECH_derived 7 Bioseq is a computed inference 911 #define MI_TECH_concept_trans 8 conceptual translation 912 #define MI_TECH_seq_pept 9 peptide sequencing used 913 #define MI_TECH_both 10 combination of 8 and 9 used 914 #define MI_TECH_seq_pept_overlap 11 peptides ordered by overlap 915 #define MI_TECH_seq_pept_homol 12 peptides ordered by homology 916 #define MI_TECH_concept_trans_a 13 concept trans supplied by author 917 #define MI_TECH_other 255 doesnt' fit anything 918 *************************************** 919 * The following are not explicitly in the ASN.1 spec yet 920 * but can still be legally used as numbers. 
921 * These are for High Throughput Genome Sequences 922 * htgs_1 - preliminary data. sequence is made of multiple 923 * contigs with gaps between them. The order of 924 * the contigs is not known, although for 925 * convenience they are in an arbitrary order 926 * htgs_2 - preliminary data. like htgs_1 except the 927 * order of the contigs is known and the sequence 928 * reflects the correct order 929 * htgs_3 - finished data. All annotations are machine 930 * generated in bulk. Usually this has been placed 931 * on a map 932 * 933 ****************************************** 934 #define MI_TECH_htgs_1 14 935 #define MI_TECH_htgs_2 15 936 #define MI_TECH_htgs_3 16 937 **********************************************************/ 938 NLM_EXTERN Boolean AddTechToEntry ( 939 NCBISubPtr submission, 940 SeqEntryPtr entry , 941 Int2 tech ); 942 943 /******************************************** 944 * How complete is the molecule? 945 * here are the allowed values: 946 * 947 completeness INTEGER { 948 unknown (0) , 949 complete (1) , -- complete biological entity 950 partial (2) , -- partial but no details given 951 no-left (3), -- KNOWN missing 5' or NH3 end 952 no-right (4) , -- KNOWN missing 3' or COOH end 953 no-ends (5) , -- KNOWN missing both ends 954 has-left (6) , -- KNOWN has complete 5' or NH3 end 955 has-right (7) , -- KNOWN has complete 3' or COOH end 956 other (255) } DEFAULT unknown } 957 958 *******************************************/ 959 NLM_EXTERN Boolean AddCompleteToEntry ( 960 NCBISubPtr submission, 961 SeqEntryPtr entry , 962 Int2 complete ); 963 964 NLM_EXTERN void AddCompleteness(NCBISubPtr submission, SeqEntryPtr sep, SeqFeatPtr sfp); 965 966 /**** OBSOLETE!!!! 
*********************************************** 967 * DO NOT USE GIBBmethod 968 * This is subsumed into AddTechToEntry, above. 969 * 970 ****************************************************************/ 971 972 NLM_EXTERN Boolean AddGIBBmethodToEntry ( 973 NCBISubPtr submission, 974 SeqEntryPtr entry , 975 Int2 method ); 976 977 /* NOTE(review): presumably the values for the 'method' argument of AddGIBBmethodToEntry above (obsolete) -- confirm */ #define METHOD_concept_transl 1 978 #define METHOD_seq_pept 2 979 #define METHOD_both 3 980 #define METHOD_seq_pept_overlap 4 981 #define METHOD_seq_pept_homol 5 982 #define METHOD_concept_transl_a 6 983 #define METHOD_other 255 984 985 /* NOTE(review): not described in this header; presumably attaches a creation date built from month/day/year to the entry -- confirm against subutil.c */ NLM_EXTERN Boolean AddCreateDateToEntry ( 986 NCBISubPtr submission, 987 SeqEntryPtr entry , 988 Int2 month , 989 Int2 day , 990 Int2 year ); 991 992 /************************************************************************* 993 * 994 * Modifiers modify the meaning of all entries in the set or sequence 995 * to which they are applied. This is particularly important for 996 * indicating organelle sequences, RNA genomes, or mutants. 997 * 998 * Less obvious is indicating completeness. 999 * 1000 * A genomic sequence is assumed to be partial unless the "complete" 1001 * modifier is used. 1002 * A peptide sequence is assumed to be complete unless the "partial" 1003 * modifier is used. 1004 * A cDNA is assumed to be complete (as well as one can tell) unless 1005 * "partial" is used. 1006 * 1007 * A genomic sequence is assumed to be nuclear unless the "mitochondrial" 1008 * (or other organelle) modifier is used. 1009 * All sequences are assumed to be natural unless "synthetic", 1010 * "recombinant", or "mutagen" are used.
1011 * 1012 *************************************************************************/ 1013 1014 /*************************************** 1015 * Adds a ValNode of the appropriate type 1016 * to the SeqEntry. 1017 * Note that the caller must still create the 1018 * specific descriptor structure and attach it to 1019 * the returned ValNode. 1020 * 1021 ****************************************/ 1022 1023 NLM_EXTERN ValNodePtr NewDescrOnSeqEntry (SeqEntryPtr entry, Int2 type); 1024 1025 /* NOTE(review): not described in this header; presumably returns a descriptor ValNode of the given type for the SeqEntry -- confirm against subutil.c whether it creates one when absent */ NLM_EXTERN ValNodePtr GetDescrOnSeqEntry ( 1026 SeqEntryPtr entry, 1027 Int2 type); 1028 1029 NLM_EXTERN Boolean AddModifierToEntry ( 1030 NCBISubPtr submission, 1031 SeqEntryPtr entry , 1032 Int2 modifier ); 1033 1034 /* NOTE(review): presumably the values for the 'modifier' argument of AddModifierToEntry above -- confirm */ #define MODIF_dna 0 1035 #define MODIF_rna 1 1036 #define MODIF_extrachrom 2 1037 #define MODIF_plasmid 3 1038 #define MODIF_mitochondrial 4 1039 #define MODIF_chloroplast 5 1040 #define MODIF_kinetoplast 6 1041 #define MODIF_cyanelle 7 1042 #define MODIF_synthetic 8 /* synthetic sequence */ 1043 #define MODIF_recombinant 9 /* recombinant construct */ 1044 #define MODIF_partial 10 1045 #define MODIF_complete 11 1046 #define MODIF_mutagen 12 /* subject of mutagenesis ? */ 1047 #define MODIF_natmut 13 /* natural mutant ?
*/ 1048 #define MODIF_transposon 14 1049 #define MODIF_insertion_seq 15 1050 #define MODIF_no_left 16 /* missing left end (5' for na, NH2 for aa) */ 1051 #define MODIF_no_right 17 /* missing right end (3' or COOH) */ 1052 #define MODIF_macronuclear 18 1053 #define MODIF_proviral 19 1054 #define MODIF_est 20 /* expressed sequence tag */ 1055 1056 1057 /*** add/build publications ***/ 1058 NLM_EXTERN Boolean AddPubToEntry ( 1059 NCBISubPtr submission, 1060 SeqEntryPtr entry , 1061 PubPtr pub ); 1062 1063 NLM_EXTERN PubPtr CitSubBuild ( /* for first data submission */ 1064 NCBISubPtr submission, 1065 Int2 month, 1066 Int2 day, 1067 Int2 year, 1068 Int2 medium ); 1069 1070 1071 NLM_EXTERN PubPtr CitSubUpdateBuild ( /* for updates to existing record */ 1072 NCBISubPtr submission, 1073 Int2 month, 1074 Int2 day, 1075 Int2 year , 1076 Int2 medium , 1077 CharPtr descr ); /* description of update, make it short */ 1078 1079 /* NOTE(review): presumably the values for the 'medium' argument of CitSubBuild and CitSubUpdateBuild above -- confirm */ #define MEDIUM_NOT_SET 0 1080 #define MEDIUM_PAPER 1 1081 #define MEDIUM_TAPE 2 1082 #define MEDIUM_FLOPPY 3 1083 #define MEDIUM_EMAIL 4 1084 #define MEDIUM_OTHER 255 1085 1086 NLM_EXTERN PubPtr CitArtBuild ( 1087 NCBISubPtr submission, 1088 CharPtr title , 1089 CharPtr journal , 1090 CharPtr volume , 1091 CharPtr issue , 1092 CharPtr pages , 1093 Int2 month , 1094 Int2 day , 1095 Int2 year , 1096 Int2 status ); 1097 1098 /* NOTE(review): presumably the values for the 'status' argument of CitArtBuild above -- confirm */ #define PUB_STATUS_PUBLISHED 0 1099 #define PUB_STATUS_SUBMITTED 1 1100 #define PUB_STATUS_IN_PRESS 2 1101 #define PUB_STATUS_UNPUBLISHED 3 1102 1103 /************************************************************************* 1104 * 1105 * Author names can be given in various forms. 1106 * You MUST give at least a last name. 1107 * You should give at least a first name or initials. 1108 * Initials are just for first and middle names, and are 1109 * separated by periods. 1110 * 1111 * example: John Q. Public 1112 * last_name = "Public" 1113 * first_name = "John" 1114 * middle_name = NULL 1115 * initials = "J.Q."
1116 * 1117 *************************************************************************/ 1118 1119 1120 NLM_EXTERN Boolean AddAuthorToPub ( /* call once for each author, in order */ 1121 NCBISubPtr submission, 1122 PubPtr the_pub, 1123 CharPtr last_name, 1124 CharPtr first_name, 1125 CharPtr middle_name, 1126 CharPtr initials, /* separated by periods, no initial for last name */ 1127 CharPtr suffix ); /* Jr. Sr. III */ 1128 1129 1130 /************************************************************************* 1131 * 1132 * Author Affiliation 1133 * Only one is allowed per pub (one per author is also possible, but is 1134 * not supported by this interface). 1135 * 1136 * affil = institutional affiliation 1137 * div = division of institution 1138 * street = street address 1139 * city = city 1140 * sub = subdivision of country (e.g. state; optional) 1141 * country = country 1142 * postal_code = postal code (zip code in the USA) 1143 * 1144 *************************************************************************/ 1145 1146 1147 NLM_EXTERN Boolean AddAffiliationToPub ( /* call once per pub */ 1148 NCBISubPtr submission, 1149 PubPtr the_pub, 1150 CharPtr affil, /* e.g. "Xyz University" */ 1151 CharPtr div, /* e.g. "Dept of Biology" */ 1152 CharPtr street, /* e.g. "123 Academic Road" */ 1153 CharPtr city, /* e.g. "Metropolis" */ 1154 CharPtr sub, /* e.g. "Massachusetts" */ 1155 CharPtr country , /* e.g.
"USA" */ 1156 CharPtr postal_code ); /* e.g. "02133" */ 1157 1158 1159 /***************************************************************************** 1160 * 1161 * Add Features to the entry 1162 * Add location to feature 1163 * Add info for specific types to feature 1164 * 1165 *****************************************************************************/ 1166 NLM_EXTERN SeqFeatPtr FeatureBuild ( 1167 NCBISubPtr submission, 1168 SeqEntryPtr entry_to_put_feature, 1169 Boolean feature_is_partial, 1170 Uint1 evidence_is_experimental, 1171 Boolean biological_exception, 1172 CharPtr comment ); 1173 1174 /* NOTE(review): presumably the values for the 'evidence_is_experimental' argument of FeatureBuild above -- confirm */ #define EVIDENCE_NOT_SET 0 1175 #define EVIDENCE_EXPERIMENTAL 1 1176 #define EVIDENCE_NOT_EXPERIMENTAL 2 1177 1178 /************************************************************************* 1179 * 1180 * About feature locations: 1181 * Internally the NCBI software represents locations on a sequence as 1182 * offsets from the start of the sequence (i.e. from 0 to (length - 1)). 1183 * Also, the "from" position is always <= "to", even for locations on 1184 * the minus strand. Finally, no location can cross the origin of a 1185 * circular sequence; it must be split in two. This makes routines 1186 * that access locations very consistent and easy to write. 1187 * 1188 * However, most biologists number sequences starting with 1, not 0. 1189 * It is natural to think of a coding region on the minus strand going 1190 * from 5243 to 2993. And it is not unusual to think of the origin of 1191 * replication being from 5344 to 10 on the plus strand of a circular 1192 * sequence. 1193 * 1194 * AddIntervalToFeature and AddPointToFeature were written to support 1195 * the biological notion. They convert to the internal format 1196 * automatically.
So, for these two functions: 1197 * 1198 * 1) numbers are in the range 1 - length 1199 * 2) from <= to on plus strand 1200 * to <= from on minus strand 1201 * 3) numbers not conforming to (2) are assumed to go around the origin 1202 * of a circular sequence. It is an error on a linear sequence. 1203 * 4) Intervals should be added in biological order (e.g. exon1, exon2, 1204 * exon3...) no matter which strand the feature is on. 1205 * 5) You must always indicate explicitly the Bioseq the interval is 1206 * on. You may either pass in the SeqEntryPtr or the local_name you 1207 * used when you created the sequence. The sequence must have 1208 * been previously created with AddSeqTo... If you give both the 1209 * SeqEntryPtr and the local_name, they must agree. 1210 * 6) -1 (minus one) is shorthand for "end of sequence". To indicate 1211 * the whole sequence you can give from = 1, to = -1. 1212 * 1213 *************************************************************************/ 1214 1215 NLM_EXTERN Boolean AddIntervalToFeature ( 1216 NCBISubPtr submission, 1217 SeqFeatPtr sfp, 1218 SeqEntryPtr the_seq , 1219 CharPtr local_name , 1220 Int4 from , 1221 Int4 to , 1222 Boolean on_plus_strand , 1223 Boolean start_before_from , 1224 Boolean stop_after_to ); 1225 1226 /* NOTE(review): the AddInt/AddPnt ...ToSeqLoc and ...ToSeqFeat variants are not described in this header; the numbering rules above are stated only for AddIntervalToFeature and AddPointToFeature, so whether these take 0-based or 1-based positions must be confirmed against subutil.c */ NLM_EXTERN Boolean AddIntToSeqLoc ( 1227 SeqLocPtr PNTR old_slp, 1228 Int4 from, 1229 Int4 to, 1230 SeqIdPtr sip, 1231 Int2 fuzz_from, 1232 Int2 fuzz_to, 1233 Int2 strand); 1234 1235 NLM_EXTERN Boolean AddIntToSeqFeat ( 1236 SeqFeatPtr sfp, 1237 Int4 from, 1238 Int4 to, 1239 BioseqPtr bsp, 1240 Int2 fuzz_from, 1241 Int2 fuzz_to, 1242 Int2 strand); 1243 1244 NLM_EXTERN Boolean AddPointToFeature ( 1245 NCBISubPtr submission, 1246 SeqFeatPtr sfp, 1247 SeqEntryPtr the_seq , 1248 CharPtr local_name , 1249 Int4 location , 1250 Boolean on_plus_strand , 1251 Boolean is_after_location , 1252 Boolean is_before_location ); 1253 1254 NLM_EXTERN Boolean AddPntToSeqLoc ( 1255 SeqLocPtr PNTR p_slp, 1256 Int4 point, 1257
BioseqPtr bsp, 1258 Int2 fuzz, 1259 Int2 strand); 1260 1261 NLM_EXTERN Boolean AddPntToSeqFeat ( 1262 SeqFeatPtr sfp, 1263 Int4 point, 1264 BioseqPtr bsp, 1265 Int2 fuzz, 1266 Int2 strand); 1267 1268 /************************************************************************* 1269 * 1270 * Having made a generalized feature, now add type-specific info to it. 1271 * 1272 *************************************************************************/ 1273 1274 /************************************************************ 1275 * 1276 * A comment is the simplest feature. It is required that you 1277 * supply a "comment" argument to FeatureBuild. In GenBank format 1278 * it will appear as misc_feature, with the comment appearing as the 1279 * /note. 1280 ***************************************************************/ 1281 1282 NLM_EXTERN Boolean MakeCommentFeature ( 1283 NCBISubPtr submission, 1284 SeqFeatPtr feature ); 1285 1286 /***************************************************************** 1287 * 1288 * This connects a protein sequence with the nucleic acid 1289 * region which codes for it. So the protein is given as an 1290 * argument, as well as adding intervals on the nucleic acid. 1291 * A complete coding region includes the initial Met codon and 1292 * the final termination codon. 1293 * 1294 *****************************************************************/ 1295 1296 1297 NLM_EXTERN Boolean MakeCdRegionFeature ( 1298 NCBISubPtr submission, 1299 SeqFeatPtr feature, 1300 Int2 frame , 1301 Int2 genetic_code , /* see end of this file for genetic codes */ 1302 SeqEntryPtr protein_product, /* give id of protein. if NULL, call */ 1303 CharPtr local_id_for_protein); /* function below to create by transl */ 1304 1305 1306 /****************************************************************** 1307 * 1308 * A Code-break allows an exception to be made in the translation 1309 * of a particular codon.
You must give the positions of the first 1310 * and last bases of the codon in the DNA sequence and the amino 1311 * acid to place there, instead of the normal translation. This 1312 * should be used sparingly, and a comment on the feature should 1313 * explain why it was done. 1314 * 1315 * The location is specified the same as in AddIntervalToFeature. 1316 * AA_for_protein points to the amino acid to use, in ncbieaa code. 1317 * 1318 ******************************************************************/ 1319 1320 NLM_EXTERN Boolean AddCodeBreakToCdRegion ( 1321 NCBISubPtr submission, 1322 SeqFeatPtr sfp, 1323 SeqEntryPtr the_seq , 1324 CharPtr local_name , 1325 Int4 from , 1326 Int4 to , 1327 Boolean on_plus_strand , 1328 CharPtr AA_for_protein ); 1329 1330 1331 /****************************************************************** 1332 * 1333 * Special function to make a protein from a CdRegion feature 1334 * 1335 ******************************************************************/ 1336 1337 NLM_EXTERN SeqEntryPtr TranslateCdRegion ( 1338 NCBISubPtr submission , 1339 SeqFeatPtr cdregion_feature , 1340 SeqEntryPtr nuc_prot_entry_to_put_sequence , 1341 CharPtr local_name , /* for protein sequence */ 1342 CharPtr genbank_locus , 1343 CharPtr genbank_accession , 1344 Int4 gi_number ); 1345 1346 NLM_EXTERN Boolean MakeRNAFeature ( 1347 NCBISubPtr submission, 1348 SeqFeatPtr feature, 1349 Int2 rna_type , 1350 Boolean is_pseudo_gene, 1351 CharPtr rna_name , 1352 CharPtr AA_for_tRNA , 1353 CharPtr codon_for_tRNA ); 1354 1355 /* NOTE(review): presumably the values for the 'rna_type' argument of MakeRNAFeature above -- confirm */ #define RNA_TYPE_premsg 1 1356 #define RNA_TYPE_mRNA 2 1357 #define RNA_TYPE_tRNA 3 1358 #define RNA_TYPE_rRNA 4 1359 #define RNA_TYPE_snRNA 5 1360 #define RNA_TYPE_scRNA 6 1361 #define RNA_TYPE_snoRNA 7 1362 #define RNA_TYPE_ncRNA 8 1363 #define RNA_TYPE_tmRNA 9 1364 #define RNA_TYPE_misc_RNA 10 1365 #define RNA_TYPE_other 255 1366 1367 /****************************************************************** 1368 * 1369 * Once you have made a tRNA feature, you may
optionally add 1370 * the location of the anticodon if you know it. This should be 1371 * within the range of the tRNA feature already created, obviously. 1372 * 1373 * The location is specified on the DNA the same as for 1374 * AddIntervalToFeature. 1375 * 1376 ******************************************************************/ 1377 1378 NLM_EXTERN Boolean AddAntiCodonTotRNA ( 1379 NCBISubPtr submission, 1380 SeqFeatPtr sfp, 1381 SeqEntryPtr the_seq , 1382 CharPtr local_name , 1383 Int4 from , 1384 Int4 to , 1385 Boolean on_plus_strand ); 1386 1387 /* NOTE(review): the Make...Feature functions below are not individually described in this header; presumably each adds type-specific data to a feature obtained from FeatureBuild -- confirm against subutil.c */ NLM_EXTERN Boolean MakeGeneFeature ( 1388 NCBISubPtr submission, 1389 SeqFeatPtr feature, 1390 CharPtr gene_symbol_for_locus , 1391 CharPtr allele , 1392 CharPtr descriptive_name , 1393 CharPtr map_location , 1394 Boolean is_pseudo_gene , 1395 CharPtr genetic_database , 1396 CharPtr gene_id_in_genetic_database , 1397 CharPtr synonym1 , 1398 CharPtr synonym2 , 1399 CharPtr synonym3 ); 1400 1401 NLM_EXTERN Boolean MakeProteinFeature ( 1402 NCBISubPtr submission, 1403 SeqFeatPtr feature , 1404 CharPtr protein_name1, 1405 CharPtr protein_name2, 1406 CharPtr protein_name3, 1407 CharPtr descriptive_name, 1408 CharPtr ECnum1, 1409 CharPtr ECnum2, 1410 CharPtr activity1, 1411 CharPtr activity2, 1412 CharPtr protein_database, 1413 CharPtr id_in_protein_database); 1414 1415 NLM_EXTERN Boolean MakeRegionFeature ( 1416 NCBISubPtr submission, 1417 SeqFeatPtr feature , 1418 CharPtr region_name ); 1419 1420 NLM_EXTERN Boolean MakeSiteFeature ( 1421 NCBISubPtr submission, 1422 SeqFeatPtr feature , 1423 Int2 site_type ); 1424 1425 NLM_EXTERN Boolean MakeImpFeature ( 1426 NCBISubPtr submission, 1427 SeqFeatPtr feature , 1428 CharPtr key ); 1429 1430 NLM_EXTERN Boolean AddQualToImpFeature ( 1431 NCBISubPtr submission, 1432 SeqFeatPtr imp_feature , 1433 CharPtr qualifier , 1434 CharPtr value ); 1435 1436 NLM_EXTERN Boolean MakePubFeature ( 1437 NCBISubPtr submission, 1438 SeqFeatPtr feature, 1439 PubPtr pub ); 1440 1441 NLM_EXTERN
Boolean AddBasesToByteStore (ByteStorePtr bsp, CharPtr the_bases); 1442 1443 NLM_EXTERN Boolean AddAAsToByteStore (ByteStorePtr bsp, CharPtr the_aas); 1444 1445 /***************************************************************************** 1446 * 1447 * AddPhrapGraph (submission, the_seq, local_name, phrap_values) 1448 * Converts a phrap byte array to a SeqGraph, wraps it in a SeqAnnot, and adds it to the Bioseq. 1449 * The length of data in the array must be equal to the length of the Bioseq. 1450 * 1451 *****************************************************************************/ 1452 1453 NLM_EXTERN Boolean AddPhrapGraph ( 1454 NCBISubPtr submission, 1455 SeqEntryPtr the_seq , 1456 CharPtr local_name , 1457 BytePtr phrap_values ); 1458 1459 /* NOTE(review): not described in this header; presumably the SeqLit counterpart of AddPhrapGraph above -- confirm the required length of phrap_values against subutil.c */ NLM_EXTERN Boolean AddPhrapGraphToSeqLit ( 1460 NCBISubPtr submission, 1461 SeqLitPtr slp , 1462 BytePtr phrap_values ); 1463 1464 /* internal functions for reference gene project */ 1465 NLM_EXTERN UserObjectPtr CreateRefGeneTrackUserObject (void); 1466 NLM_EXTERN void AddStatusToRefGeneTrackUserObject (UserObjectPtr uop, CharPtr status); 1467 NLM_EXTERN void AddGeneratedToRefGeneTrackUserObject (UserObjectPtr uop, Boolean generated); 1468 NLM_EXTERN void AddCuratorToRefGeneTrackUserObject (UserObjectPtr uop, CharPtr collaborator); 1469 NLM_EXTERN void AddCuratorURLToRefGeneTrackUserObject (UserObjectPtr uop, CharPtr url); 1470 NLM_EXTERN void AddSourceToRefGeneTrackUserObject (UserObjectPtr uop, CharPtr genomicSource); 1471 NLM_EXTERN void AddAccessionToRefGeneTrackUserObject (UserObjectPtr uop, CharPtr field, 1472 CharPtr accn, Int4 gi, Int4 from, 1473 Int4 to, CharPtr comment); 1474 1475 /* experimental function to associate mRNA with protein product in cases of alternative splicing */ 1476 NLM_EXTERN UserObjectPtr CreateMrnaProteinLinkUserObject (BioseqPtr protbsp); 1477 1478 /* vector screen, validator count, general submission comment user object (JP) */ 1479 NLM_EXTERN UserObjectPtr CreateSubmissionUserObject (CharPtr univecComment, 1480
CharPtr additionalComment, 1481 Int4 validatorErrorCount, 1482 Int4 validatorHashCode, 1483 Boolean isCloningVector); 1484 1485 /* clone name and ID for genomic contig RefSeq records */ 1486 NLM_EXTERN UserObjectPtr CreateContigCloneUserObject (CharPtr name, Int4 ID); 1487 1488 /* gene ontology process, component, and function user object */ 1489 NLM_EXTERN UserObjectPtr CreateGeneOntologyUserObject ( 1490 void 1491 ); 1492 NLM_EXTERN void AddToGeneOntologyUserObject ( 1493 UserObjectPtr uop, 1494 CharPtr type, 1495 CharPtr text, 1496 CharPtr goid, 1497 Int4 pmid, 1498 CharPtr goref, 1499 CharPtr evidence 1500 ); 1501 1502 /* model evidence user object */ 1503 NLM_EXTERN UserObjectPtr CreateModelEvidenceUserObject ( 1504 CharPtr method, 1505 CharPtr contigParent 1506 ); 1507 NLM_EXTERN void AddMrnaOrESTtoModelEvidence ( 1508 UserObjectPtr uop, 1509 CharPtr type, 1510 CharPtr accn, 1511 Int4 length, 1512 Int4 gaplen 1513 ); 1514 NLM_EXTERN UserFieldPtr FindModelEvidenceField ( 1515 UserObjectPtr uop, 1516 CharPtr type 1517 ); 1518 1519 /* third party accession list user object manipulation */ 1520 NLM_EXTERN UserObjectPtr CreateTpaAssemblyUserObject ( 1521 void 1522 ); 1523 NLM_EXTERN UserFieldPtr CreateTPAAssemblyAccessionField (CharPtr accn); 1524 NLM_EXTERN UserFieldPtr CreateTPAAssemblyFromField (Int4 from); 1525 NLM_EXTERN UserFieldPtr CreateTPAAssemblyToField (Int4 to); 1526 1527 NLM_EXTERN void AddAccessionToTpaAssemblyUserObject ( 1528 UserObjectPtr uop, 1529 CharPtr accn, 1530 Int4 from, 1531 Int4 to 1532 ); 1533 1534 /* genome projects DB user object */ NLM_EXTERN UserObjectPtr CreateGenomeProjectsDBUserObject ( 1535 void 1536 ); 1537 NLM_EXTERN UserObjectPtr AddIDsToGenomeProjectsDBUserObject ( 1538 UserObjectPtr uop, 1539 Int4 projectID, 1540 Int4 parentID 1541 ); 1542 1543 /* annot desc comment policy user object */ 1544 NLM_EXTERN UserObjectPtr CreateAnnotDescCommentPolicyUserObject ( 1545 Boolean showInCommentBlock 1546 ); 1547 1548 /* feature fetch policy user object */ 1549 1550
NLM_EXTERN UserObjectPtr CreateFeatureFetchPolicyUserObject ( 1551 CharPtr policy 1552 ); 1553 1554 /* structured comment user object for flatfile presentation */ 1555 1556 NLM_EXTERN UserObjectPtr CreateStructuredCommentUserObject ( 1557 CharPtr prefix, 1558 CharPtr suffix 1559 ); 1560 1561 NLM_EXTERN void AddItemStructuredCommentUserObject ( 1562 UserObjectPtr uop, 1563 CharPtr field, 1564 CharPtr str 1565 ); 1566 1567 /* DBLink user object for flatfile presentation */ 1568 1569 NLM_EXTERN UserObjectPtr CreateDBLinkUserObject ( 1570 void 1571 ); 1572 1573 /* NOTE(review): in the DBLink adders below, 'num' is presumably the number of entries in 'values' -- confirm against subutil.c */ NLM_EXTERN void AddIntListFieldToDBLinkUserObject ( 1574 UserObjectPtr uop, 1575 Int4 num, 1576 Int4Ptr values, 1577 CharPtr field_name 1578 ); 1579 1580 NLM_EXTERN void AddTraceAssemblyIDsToDBLinkUserObject ( 1581 UserObjectPtr uop, 1582 Int4 num, 1583 Int4Ptr values 1584 ); 1585 1586 NLM_EXTERN void AddStringListFieldToDBLinkUserObject ( 1587 UserObjectPtr uop, 1588 Int4 num, 1589 CharPtr PNTR values, 1590 CharPtr field_name 1591 ); 1592 1593 NLM_EXTERN void AddBioSampleIDsToDBLinkUserObject ( 1594 UserObjectPtr uop, 1595 Int4 num, 1596 CharPtr PNTR values 1597 ); 1598 1599 NLM_EXTERN void AddSeqReadArchIDsToDBLinkUserObject ( 1600 UserObjectPtr uop, 1601 Int4 num, 1602 CharPtr PNTR values 1603 ); 1604 1605 NLM_EXTERN void AddProbeDBIDsToDBLinkUserObject ( 1606 UserObjectPtr uop, 1607 Int4 num, 1608 CharPtr PNTR values 1609 ); 1610 1611 /* NOTE(review): differs from AddSeqReadArchIDsToDBLinkUserObject above only in the Arch/Archive spelling, with an identical parameter list; whether the two behave the same is not visible here -- confirm */ NLM_EXTERN void AddSeqReadArchiveIDsToDBLinkUserObject ( 1612 UserObjectPtr uop, 1613 Int4 num, 1614 CharPtr PNTR values 1615 ); 1616 1617 NLM_EXTERN void AddBioProjectIDsToDBLinkUserObject ( 1618 UserObjectPtr uop, 1619 Int4 num, 1620 CharPtr PNTR values 1621 ); 1622 1623 /* NcbiCleanup user object for SeriousSeqEntryCleanup time/version stamp */ 1624 1625 NLM_EXTERN UserObjectPtr CreateNcbiCleanupUserObject ( 1626 void 1627 ); 1628 1629 NLM_EXTERN void AddStringToNcbiCleanupUserObject ( 1630 UserObjectPtr uop, 1631 CharPtr field, 1632 CharPtr str 1633 ); 1634 1635 NLM_EXTERN
void AddIntegerToNcbiCleanupUserObject ( 1636 UserObjectPtr uop, 1637 CharPtr field, 1638 Int4 num 1639 ); 1640 1641 /* FindNcbiCleanupUserObject returns the user object on the top Seq-entry */ 1642 1643 NLM_EXTERN UserObjectPtr FindNcbiCleanupUserObject ( 1644 SeqEntryPtr sep 1645 ); 1646 1647 NLM_EXTERN void RemoveAllNcbiCleanupUserObjects ( 1648 SeqEntryPtr sep 1649 ); 1650 1651 /* An NcbiCleanupUserObject can also be put on a Seq-annot Annot-desc */ 1652 1653 NLM_EXTERN UserObjectPtr FindSeqAnnotCleanupUserObj ( 1654 SeqAnnotPtr sap 1655 ); 1656 1657 NLM_EXTERN void RemoveAllSeqAnnotCleanupUserObjs ( 1658 SeqAnnotPtr sap 1659 ); 1660 1661 /* NcbiAutofix user object (find / add / remove on a Seq-entry) */ NLM_EXTERN UserObjectPtr FindNcbiAutofixUserObject ( 1662 SeqEntryPtr sep 1663 ); 1664 1665 NLM_EXTERN void AddNcbiAutofixUserObject ( 1666 SeqEntryPtr sep 1667 ); 1668 1669 NLM_EXTERN void RemoveNcbiAutofixUserObjects ( 1670 SeqEntryPtr sep 1671 ); 1672 1673 /* Mark unverified sequences */ 1674 1675 NLM_EXTERN UserObjectPtr CreateUnverifiedUserObject ( 1676 void 1677 ); 1678 1679 NLM_EXTERN UserObjectPtr FindUnverifiedUserObject ( 1680 SeqEntryPtr sep 1681 ); 1682 1683 NLM_EXTERN UserObjectPtr AddUnverifiedUserObject ( 1684 SeqEntryPtr sep 1685 ); 1686 1687 NLM_EXTERN UserObjectPtr AddUnverifiedUserObjectToBioseq ( 1688 BioseqPtr bsp 1689 ); 1690 1691 NLM_EXTERN UserObjectPtr AddUnverifiedUserObjectToBioseqParent ( 1692 BioseqPtr bsp 1693 ); 1694 1695 NLM_EXTERN void AddStringToUnverifiedUserObject ( 1696 UserObjectPtr uop, 1697 CharPtr field, 1698 CharPtr str 1699 ); 1700 1701 NLM_EXTERN void RemoveUnverifiedUserObjects ( 1702 SeqEntryPtr sep 1703 ); 1704 1705 NLM_EXTERN Boolean IsUnverifiedUserObject ( 1706 UserObjectPtr uop 1707 ); 1708 1709 1710 #ifdef __cplusplus 1711 } 1712 #endif /* __cplusplus */ 1713 1714 /* this header's declarations are done: redefine NLM_EXTERN as NLM_EXPORT when that is defined, otherwise as empty */ #undef NLM_EXTERN 1715 #ifdef NLM_EXPORT 1716 #define NLM_EXTERN NLM_EXPORT 1717 #else 1718 #define NLM_EXTERN 1719 #endif /* NLM_EXPORT */ 1720 1721 #endif /* _NCBI_SubUtil_ */ 1722 1723 1724 /***************************************************************************** 1725 * 1726 *
Allowed IUPAC nucleic acid codes from /ncbi/data/seqcode.prt 1727 * 1728 ( symbol "A", name "Adenine" ), 1729 ( symbol "B" , name "G or T or C" ), 1730 ( symbol "C", name "Cytosine" ), 1731 ( symbol "D", name "G or A or T" ), 1732 ( symbol "G", name "Guanine" ), 1733 ( symbol "H", name "A or C or T" ) , 1734 ( symbol "K", name "G or T" ), 1735 ( symbol "M", name "A or C" ), 1736 ( symbol "N", name "A or G or C or T" ) , 1737 ( symbol "R", name "G or A"), 1738 ( symbol "S", name "G or C"), 1739 ( symbol "T", name "Thymine"), 1740 ( symbol "V", name "G or C or A"), 1741 ( symbol "W", name "A or T" ), 1742 ( symbol "Y", name "T or C") 1743 * 1744 * 1745 *****************************************************************************/ 1746 1747 /***************************************************************************** 1748 * 1749 * Allowed IUPAC amino acid codes from /ncbi/data/seqcode.prt 1750 1751 ( symbol "A", name "Alanine" ), 1752 ( symbol "B" , name "Asp or Asn" ), 1753 ( symbol "C", name "Cysteine" ), 1754 ( symbol "D", name "Aspartic Acid" ), 1755 ( symbol "E", name "Glutamic Acid" ), 1756 ( symbol "F", name "Phenylalanine" ), 1757 ( symbol "G", name "Glycine" ), 1758 ( symbol "H", name "Histidine" ) , 1759 ( symbol "I", name "Isoleucine" ), 1760 ( symbol "J", name "Leu or Ile" ), 1761 ( symbol "K", name "Lysine" ), 1762 ( symbol "L", name "Leucine" ), 1763 ( symbol "M", name "Methionine" ), 1764 ( symbol "N", name "Asparagine" ) , 1765 ( symbol "O", name "Pyrrolysine" ), 1766 ( symbol "P", name "Proline" ), 1767 ( symbol "Q", name "Glutamine"), 1768 ( symbol "R", name "Arginine"), 1769 ( symbol "S", name "Serine"), 1770 ( symbol "T", name "Threonine"), 1771 ( symbol "U", name "Selenocysteine"), 1772 ( symbol "V", name "Valine"), 1773 ( symbol "W", name "Tryptophan" ), 1774 ( symbol "X", name "Undetermined or atypical"), 1775 ( symbol "Y", name "Tyrosine"), 1776 ( symbol "Z", name "Glu or Gln" ) 1777 * 1778 * 1779
*****************************************************************************/ 1780 1781 /***************************************************************************** 1782 * 1783 * Genetic Code ids and names from /ncbi/data/gc.prt 1784 * gc.prt lists the legal start codons and genetic codes fully 1785 * 1786 name "Standard" , 1787 id 1 , 1788 1789 name "Vertebrate Mitochondrial" , 1790 id 2 , 1791 1792 name "Yeast Mitochondrial" , 1793 id 3 , 1794 1795 name "Mold Mitochondrial and Mycoplasma" , 1796 id 4 , 1797 1798 name "Invertebrate Mitochondrial" , 1799 id 5 , 1800 1801 name "Ciliate Macronuclear and Dasycladacean" , 1802 id 6 , 1803 1804 name "Echinoderm Mitochondrial" , 1805 id 9 , 1806 1807 name "Euplotid Macronuclear" , 1808 id 10 , 1809 1810 name "Bacterial and Plant Plastid" , 1811 id 11 , 1812 1813 name "Alternative Yeast Nuclear" , 1814 id 12 , 1815 1816 name "Ascidian Mitochondrial" , 1817 id 13 , 1818 1819 name "Alternative Flatworm Mitochondrial" , 1820 id 14 , 1821 1822 name "Blepharisma Macronuclear" , 1823 id 15 , 1824 1825 name "Chlorophycean Mitochondrial" , 1826 id 16 , 1827 1828 name "Trematode Mitochondrial" , 1829 id 21 , 1830 1831 name "Scenedesmus obliquus Mitochondrial" , 1832 id 22 , 1833 1834 name "Thraustochytrium Mitochondrial" , 1835 id 23 , 1836 1837 * 1838 * 1839 *****************************************************************************/ 1840 1841 1842 1843 1844