1 /* sqnutils.h 2 * =========================================================================== 3 * 4 * PUBLIC DOMAIN NOTICE 5 * National Center for Biotechnology Information (NCBI) 6 * 7 * This software/database is a "United States Government Work" under the 8 * terms of the United States Copyright Act. It was written as part of 9 * the author's official duties as a United States Government employee and 10 * thus cannot be copyrighted. This software/database is freely available 11 * to the public for use. The National Library of Medicine and the U.S. 12 * Government do not place any restriction on its use or reproduction. 13 * We would, however, appreciate having the NCBI and the author cited in 14 * any work or product based on this material 15 * 16 * Although all reasonable efforts have been taken to ensure the accuracy 17 * and reliability of the software and data, the NLM and the U.S. 18 * Government do not and cannot warrant the performance or results that 19 * may be obtained by using this software or data. The NLM and the U.S. 20 * Government disclaim all warranties, express or implied, including 21 * warranties of performance, merchantability or fitness for any particular 22 * purpose. 23 * 24 * =========================================================================== 25 * 26 * File Name: sqnutils.h 27 * 28 * Author: Jonathan Kans 29 * 30 * Version Creation Date: 9/2/97 31 * 32 * $Revision: 6.747 $ 33 * 34 * File Description: 35 * 36 * Modifications: 37 * -------------------------------------------------------------------------- 38 * Date Name Description of modification 39 * ------- ---------- ----------------------------------------------------- 40 * 41 * 42 * ========================================================================== 43 */ 44 45 #ifndef _SQNUTILS_ 46 #define _SQNUTILS_ 47 48 #include <ncbi.h> 49 #include <sequtil.h> 50 #include <objpubme.h> 51 #include <objentgene.h> 52 #include <util/creaders/alnread.h> 53 #include <subutil.h> 54 55 #undef NLM_EXTERN 56 #ifdef NLM_IMPORT 57 #define NLM_EXTERN NLM_IMPORT 58 #else 59 #define NLM_EXTERN extern 60 #endif 61 62 #ifdef __cplusplus 63 extern "C" { 64 #endif 65 66 typedef void (*Nlm_ChangeNotifyProc) PROTO ((Pointer)); 67 68 NLM_EXTERN SeqEntryPtr LIBCALL GetTopSeqEntryForEntityID (Uint2 entityID); 69 NLM_EXTERN SeqEntryPtr LIBCALL GetBestTopParentForData (Uint2 entityID, BioseqPtr bsp); 70 NLM_EXTERN SeqEntryPtr LIBCALL GetBestTopParentForItemID (Uint2 entityID, Uint4 itemID, Uint2 itemtype); 71 72 NLM_EXTERN SeqEntryPtr LIBCALL GetBestTopParentForDataEx (Uint2 entityID, BioseqPtr bsp, Boolean skipGenProdSet); 73 NLM_EXTERN SeqEntryPtr LIBCALL GetBestTopParentForItemIDEx (Uint2 entityID, Uint4 itemID, Uint2 itemtype, Boolean skipGenProdSet); 74 75 NLM_EXTERN SeqIdPtr SeqIdFindWorst (SeqIdPtr sip); 76 NLM_EXTERN void ChangeSeqIdToWorstID (SeqIdPtr sip); 77 NLM_EXTERN void ChangeSeqLocToWorstID (SeqLocPtr slp); 78 79 NLM_EXTERN SeqIdPtr MakeSeqID (CharPtr str); 80 NLM_EXTERN SeqIdPtr MakeUniqueSeqID (CharPtr prefix); 81 82 NLM_EXTERN DatePtr DateAdvance (DatePtr dp, Uint1 monthsToAdd); 83 84 NLM_EXTERN SeqEntryPtr LIBCALL FindNthSeqEntry (SeqEntryPtr sep, Int2 seq); 85 NLM_EXTERN SeqEntryPtr LIBCALL FindNthBioseq (SeqEntryPtr sep, Int2 seq); 86 NLM_EXTERN SeqEntryPtr LIBCALL FindNthSequinEntry (SeqEntryPtr sep, Int2 seq); 87 NLM_EXTERN SeqEntryPtr LIBCALL FindNucSeqEntry (SeqEntryPtr sep); 88 NLM_EXTERN BioseqPtr LIBCALL FindNucBioseq (SeqEntryPtr sep); 89 NLM_EXTERN SeqEntryPtr LIBCALL FindBioseqSetByClass (SeqEntryPtr sep, Uint1 _class); 90 91 NLM_EXTERN Boolean LIBCALL SeqEntryHasNucs (SeqEntryPtr sep); 92 NLM_EXTERN Boolean LIBCALL SeqEntryHasProts (SeqEntryPtr sep); 93 NLM_EXTERN Boolean LIBCALL SeqEntryHasAligns (Uint2 entityID, SeqEntryPtr sep); 94 NLM_EXTERN Boolean LIBCALL PowerBLASTASN1Detected (SeqEntryPtr sep); 95 96 NLM_EXTERN Int2 EntityIDToGeneticCode (Uint2 entityID, BoolPtr mito, CharPtr taxname, size_t maxsize); 97 NLM_EXTERN Int2 SeqEntryToGeneticCode (SeqEntryPtr sep, BoolPtr mito, CharPtr taxname, size_t maxsize); 98 NLM_EXTERN Int2 SeqEntryToBioSource (SeqEntryPtr sep, BoolPtr mito, CharPtr taxname, size_t maxsize, BioSourcePtr PNTR biopp); 99 100 NLM_EXTERN Boolean BioseqToGeneticCode ( 101 BioseqPtr bsp, 102 Int2Ptr gencodep, 103 BoolPtr mitop, 104 BoolPtr plastidp, 105 CharPtr taxnamep, 106 size_t maxsize, 107 BioSourcePtr PNTR biopp 108 ); 109 110 NLM_EXTERN SeqLocPtr CreateWholeInterval (SeqEntryPtr sep); 111 NLM_EXTERN SeqFeatPtr CreateNewFeature (SeqEntryPtr sep, SeqEntryPtr placeHere, Uint1 choice, SeqFeatPtr useThis); 112 NLM_EXTERN ValNodePtr CreateNewDescriptor (SeqEntryPtr sep, Uint1 choice); 113 114 NLM_EXTERN SeqLocPtr WholeIntervalFromSeqId (SeqIdPtr sip); 115 116 NLM_EXTERN Boolean IsPopPhyEtcSet (Uint1 _class); 117 118 /* Variants that call SeqMgrGetSeqEntryForData. The feature version allows a location 119 to be specified, overriding the default full-length seq-int location. (If location is 120 not NULL, it copies it after deleting the existing sfp->location.) For both functions 121 you still need to set the sfp->data.value.ptrvalue of the sdp->data.ptrvalue. */ 122 NLM_EXTERN SeqFeatPtr CreateNewFeatureOnBioseq (BioseqPtr bsp, Uint1 choice, SeqLocPtr slp); 123 NLM_EXTERN ValNodePtr CreateNewDescriptorOnBioseq (BioseqPtr bsp, Uint1 choice); 124 125 NLM_EXTERN void UpdateLocalId (BioseqPtr bsp, CharPtr localId); 126 NLM_EXTERN void UpdateTitle (BioseqPtr bsp, CharPtr title); 127 128 NLM_EXTERN GeneRefPtr CreateNewGeneRef (CharPtr locus, CharPtr allele, 129 CharPtr desc, Boolean pseudo); 130 NLM_EXTERN ProtRefPtr CreateNewProtRef (CharPtr name, CharPtr desc, 131 CharPtr ec, CharPtr activity); 132 NLM_EXTERN CdRegionPtr CreateNewCdRgn (Uint1 frame, Boolean orf, Int2 genCode); 133 134 NLM_EXTERN void SetSeqFeatData (SeqFeatPtr sfp, Pointer data); 135 NLM_EXTERN void SetSeqFeatProduct (SeqFeatPtr sfp, BioseqPtr bsp); 136 NLM_EXTERN void ResetSeqFeatInterval (SeqFeatPtr sfp); 137 138 NLM_EXTERN void AddSeqFeatInterval (SeqFeatPtr sfp, BioseqPtr bsp, Int4 from, Int4 to, 139 Boolean partial5, Boolean partial3); 140 141 NLM_EXTERN void AddSeqLocPoint (SeqLocPtr PNTR old_slp, SeqIdPtr sip, Int4 location, 142 Boolean fuzz_before, Boolean fuzz_after, Int2 strand); 143 NLM_EXTERN void AddSeqFeatPoint (SeqFeatPtr sfp, BioseqPtr bsp, Int4 location, Boolean fuzz_before, Boolean fuzz_after, Int2 strand); 144 145 /* AddSeqEntryToSeqEntry and ReplaceSeqEntryWithSeqEntry leave 146 the original target sep pointing to the new structure. */ 147 148 NLM_EXTERN void AddSeqEntryToSeqEntry (SeqEntryPtr target, SeqEntryPtr insert, Boolean relink); 149 NLM_EXTERN void ReplaceSeqEntryWithSeqEntry (SeqEntryPtr target, SeqEntryPtr replaceWith, Boolean relink); 150 151 NLM_EXTERN void RemoveSeqEntryFromSeqEntry (SeqEntryPtr top, SeqEntryPtr del, Boolean relink); 152 NLM_EXTERN Int4 RenormalizeNucProtSets (SeqEntryPtr sep, Boolean relink); 153 NLM_EXTERN Int4 RemoveSingleItemSet (SeqEntryPtr sep, Boolean relink); 154 155 /* The following functions are called by the above when relink is TRUE. Examine the 156 code of ReplaceSeqEntryWithSeqEntry (in dlgutil2.c) to see how relink is treated. */ 157 158 NLM_EXTERN void GetSeqEntryParent (SeqEntryPtr target, Pointer PNTR parentptr, Uint2Ptr parenttype); 159 160 NLM_EXTERN void SaveSeqEntryObjMgrData (SeqEntryPtr target, ObjMgrDataPtr PNTR omdptopptr, ObjMgrData PNTR omdataptr); 161 NLM_EXTERN void RestoreSeqEntryObjMgrData (SeqEntryPtr target, ObjMgrDataPtr omdptop, ObjMgrData PNTR omdataptr); 162 163 /* If relink FALSE, call SeqMgrLinkSeqEntry (target, parenttype, parentptr) 164 with original parent after all sequences have been added to the target. */ 165 166 /* If relink FALSE, call SaveSeqEntryObjMgrData with the address of temporary 167 ObjMgrDataPtr and ObjMgrData variables, and after calling SeqMgrLinkSeqEntry to 168 update the link table, call RestoreSeqEntryObjMgrData with the value of the 169 temporary ObjMgrDataPtr and the address of the ObjMgrData variable. */ 170 171 /* ExtractBioSourceAndPubs and ReplaceBioSourceAndPubs can be called before and 172 after AddSeqEntryToSeqEntry to propagate source and pub descriptors to top level. */ 173 174 NLM_EXTERN ValNodePtr ExtractBioSourceAndPubs (SeqEntryPtr sep); 175 NLM_EXTERN void ReplaceBioSourceAndPubs (SeqEntryPtr sep, ValNodePtr descr); 176 177 /* SeqLocMerge combines feature intervals. It can be used to extend the gene feature 178 intervals, and (eventually) to fuse mutliple features into one. */ 179 180 NLM_EXTERN SeqLocPtr SeqLocMerge (BioseqPtr target, 181 SeqLocPtr to, SeqLocPtr from, 182 Boolean single_interval, Boolean fuse_joints, 183 Boolean add_null); 184 185 NLM_EXTERN SeqLocPtr SeqLocMergeEx (BioseqPtr target, SeqLocPtr to, SeqLocPtr from, 186 Boolean single_interval, Boolean fuse_joints, 187 Boolean merge_overlaps, Boolean add_null); 188 189 NLM_EXTERN SeqLocPtr SeqLocMergeExEx ( 190 BioseqPtr target, 191 SeqLocPtr to, 192 SeqLocPtr from, 193 Boolean single_interval, 194 Boolean fuse_joints, 195 Boolean merge_overlaps, 196 Boolean add_null, 197 Boolean ignore_mixed, 198 Boolean ignore_out_of_order, 199 Boolean relaxed 200 ); 201 202 NLM_EXTERN Boolean CheckSeqLocForPartial (SeqLocPtr location, BoolPtr p5ptr, BoolPtr p3ptr); 203 NLM_EXTERN void SetSeqLocPartial (SeqLocPtr location, Boolean partial5, Boolean partial3); 204 NLM_EXTERN void FreeAllFuzz (SeqLocPtr location); 205 NLM_EXTERN Boolean LocationHasNullsBetween (SeqLocPtr location); 206 NLM_EXTERN void NormalizeNullsBetween (SeqLocPtr location); 207 NLM_EXTERN ValNodePtr GetSeqLocPartialSet (SeqLocPtr location); 208 NLM_EXTERN void SetSeqLocPartialSet (SeqLocPtr location, ValNodePtr vnp); 209 NLM_EXTERN Boolean SeqLocBadSortOrder (BioseqPtr bsp, SeqLocPtr slp); 210 NLM_EXTERN Boolean SeqLocMixedStrands (BioseqPtr bsp, SeqLocPtr slp); 211 /* Check/SetSeqLocPartialEx take lim argument - 3 is tr, 4 is tl */ 212 NLM_EXTERN Boolean CheckSeqLocForPartialEx (SeqLocPtr location, BoolPtr p5ptr, BoolPtr p3ptr, Int4Ptr limptr); 213 NLM_EXTERN void SetSeqLocPartialEx (SeqLocPtr location, Boolean partial5, Boolean partial3, Int4 lim); 214 215 /* GetBioseqGivenSeqLoc returns a segmented bioseq if the SeqLoc is to the parts */ 216 217 NLM_EXTERN BioseqPtr GetBioseqGivenSeqLoc (SeqLocPtr slp, Uint2 entityID); 218 219 NLM_EXTERN BioseqPtr GetBioseqGivenIDs (Uint2 entityID, Uint4 itemID, Uint2 itemtype); 220 NLM_EXTERN Uint4 GetItemIDGivenPointer (Uint2 entityID, Uint2 itemtype, Pointer lookfor); 221 222 NLM_EXTERN Uint1 FindFeatFromFeatDefType (Uint2 subtype); 223 NLM_EXTERN Uint1 FindFeatDefTypeFromKey (CharPtr key); 224 NLM_EXTERN CharPtr FindKeyFromFeatDefType (Uint1 type, Boolean forGBFF); 225 226 NLM_EXTERN Uint1 CodonToGcIndex (CharPtr codon); 227 NLM_EXTERN CharPtr GcIndextoCodon (Uint1 index); 228 229 NLM_EXTERN GBQualPtr SortFeatureGBQuals (GBQualPtr list); 230 NLM_EXTERN void CleanupDuplicateGBQuals (GBQualPtr PNTR prevgbq); 231 232 /* finds bioseq from (cds) product, gets largest protein feature packaged on it */ 233 234 NLM_EXTERN SeqFeatPtr LIBCALL GetBestProteinFeatureUnindexed (SeqLocPtr product); 235 236 /* set coding region partial flags by initial dash and final star in translation */ 237 238 NLM_EXTERN void CodingRegionPartialsFromTranslation (SeqEntryPtr sep); 239 240 /* impose coding region partial flags onto appropriate mRNA and gene features */ 241 242 NLM_EXTERN void ImposeCodingRegionPartials (SeqEntryPtr sep); 243 244 /* resynchronizes coding regions with product protein bioseq molinfo and protein feature */ 245 246 NLM_EXTERN void ResynchCodingRegionPartials (SeqEntryPtr sep); 247 NLM_EXTERN Boolean ResynchCodingRegionPartialsEx (SeqEntryPtr sep, FILE *log_fp); 248 249 /* resynchronizes mRNAs with product cDNA bioseq */ 250 251 NLM_EXTERN void ResynchMessengerRNAPartials (SeqEntryPtr sep); 252 253 /* resynchronizes protein feature with product peptide bioseq */ 254 255 NLM_EXTERN void ResynchProteinPartials (SeqEntryPtr sep); 256 257 /* individual feature callbacks for above functions */ 258 259 NLM_EXTERN void CDSPartialsFromTranslation (SeqFeatPtr sfp, Pointer userdata); 260 NLM_EXTERN void ImposeCDSPartials (SeqFeatPtr sfp, Pointer userdata); 261 NLM_EXTERN void ImposeGenePartials (SeqFeatPtr sfp, Pointer userdata); 262 NLM_EXTERN void ResynchMRNAPartials (SeqFeatPtr sfp, Pointer userdata); 263 NLM_EXTERN void ResynchCDSPartials (SeqFeatPtr sfp, Pointer userdata); 264 NLM_EXTERN void ResynchPeptidePartials (SeqFeatPtr sfp, Pointer userdata); 265 266 /* functions for associating CDS and parent mRNA using featureIDs */ 267 268 NLM_EXTERN void ClearFeatIDs (SeqFeatPtr sfp); 269 NLM_EXTERN void ClearFeatIDXrefs (SeqFeatPtr sfp); 270 271 NLM_EXTERN void ClearFeatureIDs (SeqEntryPtr sep); 272 NLM_EXTERN Int4 FindHighestFeatureID (SeqEntryPtr sep); 273 274 NLM_EXTERN void AssignFeatureIDs (SeqEntryPtr sep); 275 NLM_EXTERN void AssignFeatureIDsWithOffset (SeqEntryPtr sep, Int4Ptr last_used_id, Int4Ptr last_used_ref); 276 277 NLM_EXTERN void OffsetFeatureIDs (SeqEntryPtr sep, Int4 offset); 278 NLM_EXTERN void OffsetFeatureIDXrefs (SeqEntryPtr sep, Int4 offset); 279 280 NLM_EXTERN void ReassignFeatureIDs (SeqEntryPtr sep); 281 282 NLM_EXTERN void LinkCDSmRNAbyOverlap (SeqEntryPtr sep); 283 NLM_EXTERN void LinkCDSmRNAbyProduct (SeqEntryPtr sep); 284 NLM_EXTERN void LinkCDSmRNAbyLabel (SeqEntryPtr sep); 285 NLM_EXTERN void LinkCDSmRNAbyLabelAndLocation (SeqEntryPtr sep); 286 287 NLM_EXTERN void StripFeatIDXrefAsnFilter (AsnIoPtr aip, AsnIoPtr aop); 288 NLM_EXTERN void StripSeqDataGapAsnFilter (AsnIoPtr aip, AsnIoPtr aop); 289 NLM_EXTERN void StripNewFeatMolInfoFieldsAsnFilter (AsnIoPtr aip, AsnIoPtr aop); 290 NLM_EXTERN void StripPCRPrimerAsnFilter (AsnIoPtr aip, AsnIoPtr aop); 291 NLM_EXTERN void StripOrgNamePgcodeAsnFilter (AsnIoPtr aip, AsnIoPtr aop); 292 NLM_EXTERN void StripGeneRnaPcrAsnFilter (AsnIoPtr aip, AsnIoPtr aop); 293 NLM_EXTERN void StripSeqFeatSupportAsnFilter (AsnIoPtr aip, AsnIoPtr aop); 294 295 /* functions to parse [org=Drosophila melanogaster] and [gene=lacZ] from titles */ 296 /* for example, passing "gene" to SqnTagFind returns "lacZ" */ 297 298 #define MAX_SQN_TAGS 200 299 300 typedef struct sqntag { 301 CharPtr query; 302 Int2 num_tags; 303 CharPtr tag [MAX_SQN_TAGS]; 304 CharPtr val [MAX_SQN_TAGS]; 305 Boolean used [MAX_SQN_TAGS]; 306 } SqnTag, PNTR SqnTagPtr; 307 308 NLM_EXTERN SqnTagPtr SqnTagParse (CharPtr ttl); 309 NLM_EXTERN SqnTagPtr SqnTagFree (SqnTagPtr stp); 310 311 NLM_EXTERN CharPtr SqnTagFind (SqnTagPtr stp, CharPtr tag); 312 NLM_EXTERN ValNodePtr SqnTagFindMultiple (SqnTagPtr stp, CharPtr tag); 313 NLM_EXTERN CharPtr SqnTagFindUnused (SqnTagPtr stp, CharPtr tag); 314 315 NLM_EXTERN void ReadTechFromString (CharPtr str, MolInfoPtr mip); 316 NLM_EXTERN void ReadCompletenessFromString (CharPtr str, MolInfoPtr mip); 317 318 extern Boolean StringsAreEquivalent (CharPtr str1, CharPtr str2); 319 NLM_EXTERN Uint1 EquivalentSubSource (CharPtr str); 320 NLM_EXTERN Uint1 EquivalentOrgMod (CharPtr str); 321 NLM_EXTERN Uint1 EquivalentSubSourceEx (CharPtr str, Boolean allow_discouraged_and_discontinued); 322 NLM_EXTERN Uint1 EquivalentOrgModEx (CharPtr str, Boolean allow_discouraged_and_discontinued); 323 324 /* functions to extract BioSource, MolInfo, and Bioseq information from parsed titles */ 325 326 NLM_EXTERN BioSourcePtr ParseTitleIntoBioSource ( 327 SqnTagPtr stp, 328 CharPtr organism, 329 BioSourcePtr biop 330 ); 331 332 NLM_EXTERN MolInfoPtr ParseTitleIntoMolInfo ( 333 SqnTagPtr stp, 334 MolInfoPtr mip 335 ); 336 337 NLM_EXTERN BioseqPtr ParseTitleIntoBioseq ( 338 SqnTagPtr stp, 339 BioseqPtr bsp 340 ); 341 342 NLM_EXTERN GeneRefPtr ParseTitleIntoGeneRef ( 343 SqnTagPtr stp, 344 GeneRefPtr grp 345 ); 346 347 NLM_EXTERN ProtRefPtr ParseTitleIntoProtRef ( 348 SqnTagPtr stp, 349 ProtRefPtr prp 350 ); 351 352 NLM_EXTERN GBBlockPtr ParseTitleIntoGenBank ( 353 SqnTagPtr stp, 354 GBBlockPtr gbp 355 ); 356 357 NLM_EXTERN SeqHistPtr ParseTitleIntoSeqHist ( 358 SqnTagPtr stp, 359 SeqHistPtr shp 360 ); 361 362 NLM_EXTERN SeqHistPtr ParseStringIntoSeqHist ( 363 SeqHistPtr shp, 364 CharPtr str 365 ); 366 367 NLM_EXTERN void ParseTitleIntoSubmitBlock ( 368 SqnTagPtr stp, 369 SubmitBlockPtr sbp 370 ); 371 372 NLM_EXTERN UserObjectPtr ParseTitleIntoTpaAssembly ( 373 SqnTagPtr stp, 374 UserObjectPtr uop 375 ); 376 377 NLM_EXTERN UserObjectPtr ParseTitleIntoGenomeProjectsDB ( 378 SqnTagPtr stp, 379 UserObjectPtr uop 380 ); 381 382 NLM_EXTERN void AddFieldStringToDbLinkUserObject ( 383 CharPtr str, 384 CharPtr field_name, 385 UserObjectPtr uop 386 ); 387 388 NLM_EXTERN UserObjectPtr ParseTitleIntoDBLinkBioProject ( 389 SqnTagPtr stp, 390 UserObjectPtr uop 391 ); 392 393 NLM_EXTERN UserObjectPtr ParseTitleIntoDBLinkBioSample ( 394 SqnTagPtr stp, 395 UserObjectPtr uop 396 ); 397 398 NLM_EXTERN UserObjectPtr ParseTitleIntoDBLinkSeqReadArch ( 399 SqnTagPtr stp, 400 UserObjectPtr uop 401 ); 402 403 NLM_EXTERN Boolean IsGenomeProjectIDDescriptor (SeqDescrPtr sdp); 404 NLM_EXTERN SeqDescrPtr GetGenomeProjectIDDescriptor (BioseqPtr bsp); 405 NLM_EXTERN Int4 GetGenomeProjectID (BioseqPtr bsp); 406 407 NLM_EXTERN CharPtr GetTSAIDDB (BioseqPtr bsp); 408 409 410 NLM_EXTERN void AddPubsFromTitle ( 411 SqnTagPtr stp, 412 SeqDescrPtr PNTR desc_list 413 ); 414 415 /* structured comment user object for flatfile presentation */ 416 417 NLM_EXTERN UserObjectPtr ParseStringIntoStructuredComment ( 418 UserObjectPtr uop, 419 CharPtr str, 420 CharPtr prefix, 421 CharPtr suffix 422 ); 423 424 425 /* UseLocalAsnloadDataAndErrMsg transiently sets paths to asnload, data, and errmsg 426 if they are packaged in the same directory as the executing program. */ 427 428 NLM_EXTERN Boolean UseLocalAsnloadDataAndErrMsg (void); 429 430 /* GetRidOfLocusInSeqIds strips locus from all feature location and product seqIds */ 431 432 NLM_EXTERN void GetRidOfLocusInSeqIds (Uint2 entityID, SeqEntryPtr sep); 433 434 NLM_EXTERN SeqLocPtr StripLocusFromSeqLoc (SeqLocPtr location); 435 NLM_EXTERN SeqIdPtr SeqIdStripLocus (SeqIdPtr sip); 436 437 /* LeaveBestCDD removes all but best CDD region in an area of overlapping features */ 438 439 NLM_EXTERN void LeaveBestCDD (SeqEntryPtr sep); 440 441 /* ConvertPubSrcComDescsToFeats is useful when merging records */ 442 443 NLM_EXTERN Boolean ConvertPubSrcComDescsToFeats (SeqEntryPtr sep, Boolean pub, Boolean src, Boolean com, Boolean toProts, Boolean PNTR asked_about_prop, Boolean PNTR propagate_descriptions, CharPtr findstring); 444 445 NLM_EXTERN void DeleteMultipleTitles (SeqEntryPtr sep, Pointer mydata, Int4 index, Int2 indent); 446 447 NLM_EXTERN Uint1 FindTrnaAA (CharPtr str); 448 NLM_EXTERN Uint1 FindTrnaAA3 (CharPtr str); 449 NLM_EXTERN CharPtr GetLongSymbolForAA (Char aa); 450 NLM_EXTERN Uint1 ParseTRnaString (CharPtr strx, BoolPtr justTrnaText, Uint1Ptr codon, Boolean noSingleLetter); 451 NLM_EXTERN CharPtr FindTrnaAAIndex (CharPtr str); 452 NLM_EXTERN Char FindResidueByName (CharPtr res_name, SeqCodeTablePtr sctp); 453 NLM_EXTERN ValNodePtr TokenizeTRnaString (CharPtr strx); 454 NLM_EXTERN Boolean ParseDegenerateCodon (tRNAPtr trp, Uint1Ptr codon); 455 NLM_EXTERN Boolean SerialNumberInString (CharPtr str); 456 457 /* ModernizeRNAFields uses new RNAGenPtr choice of RnaRef.ext for misc_RNA, ncRNA, tmRNA */ 458 459 NLM_EXTERN void ModernizeRNAFields ( 460 SeqFeatPtr sfp 461 ); 462 463 /* ModernizeGeneFields populates new GeneNomenclaturePtr field from OfficialNomenclature user object */ 464 465 NLM_EXTERN void ModernizeGeneFields ( 466 SeqFeatPtr sfp 467 ); 468 469 /* for sorting and uniquing valnode list by (charptr) data.ptrvalue (with case sensitive/insensitive variants) */ 470 471 NLM_EXTERN int LIBCALLBACK SortVnpByStringCS (VoidPtr ptr1, VoidPtr ptr2); 472 NLM_EXTERN int LIBCALLBACK SortVnpByStringCI (VoidPtr ptr1, VoidPtr ptr2); 473 NLM_EXTERN int LIBCALLBACK SortVnpByStringCIUCFirst (VoidPtr ptr1, VoidPtr ptr2); 474 NLM_EXTERN int LIBCALLBACK SortVnpByStringCILCFirst (VoidPtr ptr1, VoidPtr ptr2); 475 476 NLM_EXTERN ValNodePtr UniqueStringValNodeCS (ValNodePtr list); 477 NLM_EXTERN ValNodePtr UniqueStringValNodeCI (ValNodePtr list); 478 479 NLM_EXTERN int LIBCALLBACK SortVnpByNaturalCS (VoidPtr ptr1, VoidPtr ptr2); 480 NLM_EXTERN int LIBCALLBACK SortVnpByNaturalCI (VoidPtr ptr1, VoidPtr ptr2); 481 482 NLM_EXTERN int LIBCALLBACK SortVnpByString (VoidPtr ptr1, VoidPtr ptr2); 483 NLM_EXTERN ValNodePtr UniqueValNode (ValNodePtr list); 484 485 /* for sorting valnode list by choice */ 486 487 NLM_EXTERN int LIBCALLBACK SortByChoice (VoidPtr ptr1, VoidPtr ptr2); 488 489 /* for sorting and uniquing valnode list by data.intvalue */ 490 491 NLM_EXTERN int LIBCALLBACK SortByIntvalue (VoidPtr ptr1, VoidPtr ptr2); 492 NLM_EXTERN ValNodePtr UniqueIntValNode (ValNodePtr list); 493 494 /* for sorting and uniquing valnode list by data.ptrvalue */ 495 496 NLM_EXTERN int LIBCALLBACK SortByPtrvalue (VoidPtr ptr1, VoidPtr ptr2); 497 NLM_EXTERN ValNodePtr UniquePtrValNode (ValNodePtr list); 498 499 /* keytag sorts/uniques and then owns valnode character list */ 500 501 typedef struct keytag { 502 Int2 num; 503 ValNodePtr list; 504 CharPtr PNTR index; /* elements point into above valnode list */ 505 } KeyTag; /* used as substructure, not allocated separately */ 506 507 NLM_EXTERN void KeyTagInit (KeyTag PNTR ktp, ValNodePtr list); 508 NLM_EXTERN void KeyTagClear (KeyTag PNTR ktp); 509 510 NLM_EXTERN Int2 KeyFromTag (KeyTag PNTR ktp, CharPtr tag); 511 NLM_EXTERN CharPtr TagFromKey (KeyTag PNTR ktp, Int2 key); 512 513 /* inference qualifier utility */ 514 515 #define VALID_INFERENCE 0 516 #define EMPTY_INFERENCE_STRING 1 517 #define BAD_INFERENCE_PREFIX 2 518 #define BAD_INFERENCE_BODY 3 519 #define SINGLE_INFERENCE_FIELD 4 520 #define SPACES_IN_INFERENCE 5 521 #define INFERENCE_HAS_COMMENT 6 522 #define SAME_SPECIES_MISUSED 7 523 #define BAD_INFERENCE_ACCESSION 8 524 #define BAD_INFERENCE_ACC_VERSION 9 525 #define ACC_VERSION_NOT_PUBLIC 10 526 #define BAD_ACCESSION_TYPE 11 527 #define UNRECOGNIZED_DATABASE 12 528 529 NLM_EXTERN Int2 ValidateInferenceQualifier (CharPtr val, Boolean fetchAccn); 530 531 532 /* from Colombe */ 533 NLM_EXTERN SeqLocPtr StringSearchInBioseq (SeqIdPtr sip, CharPtr sub); 534 535 /***************************************************************************** 536 * 537 * SequinEntryList (sep, mydata, mycallback, index, indent) 538 * traverses all Seq-entry nodes beginning with sep 539 * calls mycallback () at each node 540 * Does enter BioseqSets of _class "parts", but ignores the 541 * parts set itself 542 * 543 *****************************************************************************/ 544 545 NLM_EXTERN Int4 SequinEntryList (SeqEntryPtr sep, Pointer mydata, SeqEntryFunc mycallback, Int4 index, Int2 indent); 546 547 #define SequinEntryCount( a ) SequinEntryList( a ,NULL,NULL,0,0) 548 #define SequinEntryExplore(a,b,c) SequinEntryList(a, b, c, 0L, 0) 549 550 /* Phrap reading function, based on sample code supplied by C. Magness, returns a SeqEntry list 551 of Bioseqs containing SeqGraphs, with individual reads removed and only contigs remaining */ 552 553 NLM_EXTERN SeqEntryPtr ReadPhrapFile (FILE *fp); 554 555 /* Internal function to read quality scores, made available to parse separate DNA and quality score files */ 556 557 NLM_EXTERN SeqGraphPtr ReadPhrapQuality (FILE *fp, BioseqPtr bsp); 558 NLM_EXTERN SeqGraphPtr ReadPhrapQualityFC (FileCachePtr fcp, BioseqPtr bsp); 559 560 /* SetPhrapContigOrder takes the results of ReadPhrapFile and a string indicating the order 561 of contigs, and returns a SeqEntryList in the desired order, with all other contigs removed */ 562 563 NLM_EXTERN SeqEntryPtr SetPhrapContigOrder (SeqEntryPtr head, CharPtr contigs); 564 565 NLM_EXTERN void PrintQualityScores (BioseqPtr bsp, FILE *fp); 566 567 NLM_EXTERN void TrimSeqGraph (SeqGraphPtr sgp, Int4 num_to_trim, Boolean from_left); 568 NLM_EXTERN void TrimQualityScores (BioseqPtr bsp, Int4 num_to_trim, Boolean from_left); 569 570 NLM_EXTERN void ReverseSeqGraph (SeqGraphPtr sgp); 571 NLM_EXTERN void ReverseQualityScores (BioseqPtr bsp); 572 573 574 typedef void (*QualityWriteFunc) (CharPtr buf, Uint4 buflen, Pointer userdata); 575 576 NLM_EXTERN void PrintQualityScoresToBuffer (BioseqPtr bsp, Boolean gapIsZero, Pointer userdata, QualityWriteFunc callback); 577 578 /* special function for genome contig delta sequences with far pointers */ 579 580 NLM_EXTERN void PrintQualityScoresForContig (BioseqPtr bsp, Boolean gapIsZero, FILE* fp); 581 582 /* more efficient function for far genomic contig, makes separate graphs */ 583 584 NLM_EXTERN SeqAnnotPtr PhrapGraphForContig (BioseqPtr bsp); 585 586 /* ReadContigList builds a far segmented bioseq from a table of accessions, starts, stops, 587 lengths, and (optional) strands. Gaps of a given length (with 0 start and stop) are also 588 allowed. */ 589 590 NLM_EXTERN SeqEntryPtr ReadContigList (FILE *fp, Boolean coordinatesOnMaster); 591 NLM_EXTERN SeqEntryPtr ReadContigListEx (FILE *fp, Boolean coordinatesOnMaster, CharPtr seqid, CharPtr title); 592 593 /* ReadAsnFastaOrFlatFile reads object manager-registered ASN.1, FASTA, GenBank, EMBL, GenPept, 594 Feature table, Restriction table, Contig table, Message response, or saved UID list, with the 595 option of saving FASTA results as OBJ_FASTA (SimpleSeq) to avoid ID collisions */ 596 597 NLM_EXTERN Pointer ReadAsnFastaOrFlatFileEx (FILE *fp, Uint2Ptr datatypeptr, Uint2Ptr entityIDptr, 598 Boolean forceNuc, Boolean forceProt, 599 Boolean parseFastaSeqId, Boolean fastaAsSimpleSeq, 600 BoolPtr chars_stripped); 601 NLM_EXTERN Pointer ReadAsnFastaOrFlatFile (FILE *fp, Uint2Ptr datatypeptr, Uint2Ptr entityIDptr, 602 Boolean forceNuc, Boolean forceProt, 603 Boolean parseFastaSeqId, Boolean fastaAsSimpleSeq); 604 605 /* ReadFeatureTableFile only handles >Feature tables */ 606 607 NLM_EXTERN Pointer ReadFeatureTableFile ( 608 FILE *fp, 609 Uint2Ptr datatypeptr, 610 Uint2Ptr entityIDptr, 611 Int4Ptr lineP, 612 BoolPtr failP, 613 Boolean ignore_web_comments 614 ); 615 616 NLM_EXTERN BioseqPtr GetBioseqReferencedByAnnot ( 617 SeqAnnotPtr sap, 618 Uint2 entityID 619 ); 620 621 /* ReadDeltaFasta reads a FASTA file, combining raw sequence and >?unk100 lines into 622 a delta Bioseq. The file pointer stops at the next FASTA with a real SeqID. */ 623 624 NLM_EXTERN BioseqPtr ReadDeltaFasta (FILE *fp, Uint2Ptr entityIDptr); 625 626 /* This function is identical to ReadDeltaFasta, except that the contents of 627 * chars_stripped will be set to TRUE if characters other than digits were stripped from 628 * the sequence, or FALSE if not. 629 */ 630 NLM_EXTERN BioseqPtr ReadDeltaFastaEx (FILE *fp, Uint2Ptr entityIDptr, BoolPtr chars_stripped); 631 NLM_EXTERN BioseqPtr ReadDeltaFastaExEx (FILE *fp, Uint2Ptr entityIDptr, BoolPtr chars_stripped, BoolPtr cache_failed); 632 633 /* ReadDeltaFastaWithEmptyDefline reads just one delta sequence with an empty 634 * definition line. 635 * Calling function should make sure that fp is set to the start of the line 636 * with the empty definition line and that there is a "gap sequence ID" 637 * present as the next definition line in the file. 638 */ 639 NLM_EXTERN BioseqPtr ReadDeltaFastaWithEmptyDefline (FILE *fp, Uint2Ptr entityIDptr, BoolPtr chars_stripped); 640 641 /* PromoteXrefs expands generef or protref feature cross-references (made by reading a 642 feature table with ReadAsnFastaOrFlatFile) to stand-alone gene features or protein features 643 and protein bioseqs. It processes ALL features in the list - you give it the FIRST sfp. */ 644 645 NLM_EXTERN void PromoteXrefs ( 646 SeqFeatPtr sfp, 647 BioseqPtr bsp, 648 Uint2 entityID 649 ); 650 NLM_EXTERN void PromoteXrefsEx ( 651 SeqFeatPtr sfp, 652 BioseqPtr bsp, 653 Uint2 entityID, 654 Boolean include_stop, 655 Boolean remove_trailingX, 656 Boolean gen_prod_set 657 ); 658 NLM_EXTERN void PromoteXrefsExEx ( 659 SeqFeatPtr sfp, 660 BioseqPtr bsp, 661 Uint2 entityID, 662 Boolean include_stop, 663 Boolean remove_trailingX, 664 Boolean gen_prod_set, 665 Boolean force_local_id, 666 BoolPtr seq_fetch_failP 667 ); 668 669 /* SetEmptyGeneticCodes imposes genetic code on all coding regions within a feature table */ 670 671 NLM_EXTERN void SetEmptyGeneticCodes (SeqAnnotPtr sap, Int2 genCode); 672 673 /* AddIntervalToLocation is a convenience function to add a single interval, and is called by 674 ReadAsnFastaOrFlatFile internally. */ 675 676 NLM_EXTERN SeqLocPtr AddIntervalToLocation (SeqLocPtr loc, SeqIdPtr sip, Int4 start, 677 Int4 stop, Boolean partial5, Boolean partial3); 678 679 /* AddQualifierToFeature applies cds product and gene qualifiers as protref or generef stored 680 as feature xrefs. Most others (e.g., protein_id) are stored as gbquals. PromoteXrefs can then 681 turn these special cases into the appropriate structures in fully expanded records. */ 682 683 NLM_EXTERN void AddQualifierToFeature (SeqFeatPtr sfp, CharPtr qual, CharPtr val); 684 685 /* specialized string trimming functions */ 686 687 NLM_EXTERN CharPtr TrimSpacesAndSemicolons (CharPtr str); 688 NLM_EXTERN CharPtr TrimSpacesAndJunkFromEnds (CharPtr str, Boolean allowEllipsis); 689 690 /* specialized cleanup for subsource and orgmod lists */ 691 NLM_EXTERN void CleanSubSourceList (SubSourcePtr PNTR sspp, Uint1 location); 692 NLM_EXTERN void CleanOrgModList (OrgModPtr PNTR ompp); 693 694 /* used by original BankIt to merge multiple primer subsources */ 695 NLM_EXTERN void CleanSubSourcePrimers (SubSourcePtr PNTR sspp); 696 697 NLM_EXTERN Boolean PubIsEffectivelyEmpty (PubdescPtr pdp); 698 699 /* extracts and reinserts descriptors in a standard order */ 700 NLM_EXTERN void NormalizeDescriptorOrder (SeqEntryPtr sep); 701 702 /* BasicSeqEntryCleanup cleans up strings, moves gbquals to the appropriate field, and 703 does several other conversions, all without changing the itemID structure (which would 704 require reindexing) */ 705 706 NLM_EXTERN void BasicSeqEntryCleanup (SeqEntryPtr sep); 707 708 /* AdvancedSeqEntryCleanup also resynchronizes CDS, mRNA, and protein partials */ 709 710 NLM_EXTERN void AdvancedSeqEntryCleanup (SeqEntryPtr sep); 711 712 /* cleanup for a single descriptor, after editing */ 713 NLM_EXTERN void CleanupStringsForOneDescriptor (SeqDescPtr sdp, SeqEntryPtr sep); 714 715 /* Selective components of BasicSeqEntryCleanup can be called for QA filtering */ 716 717 NLM_EXTERN void CleanUpSeqFeat (SeqFeatPtr sfp, Boolean isEmblOrDdbj, Boolean isJscan, Boolean stripSerial, Boolean modernizeFeats, ValNodePtr PNTR publist); 718 719 NLM_EXTERN void CleanUpSeqLoc (SeqLocPtr slp); 720 721 NLM_EXTERN Boolean FixWrongFuzzOnPlusStrand (SeqLocPtr location); 722 NLM_EXTERN Boolean FixWrongFuzzOnMinusStrand (SeqLocPtr location); 723 724 NLM_EXTERN void CleanupSubSourceOrgModOtherFeat (SeqFeatPtr sfp, Pointer userdata); 725 NLM_EXTERN void CleanupSubSourceOrgModOtherDesc (SeqDescrPtr sdp, Pointer userdata); 726 727 NLM_EXTERN void CleanUpPubdescAuthors (PubdescPtr pdp); 728 NLM_EXTERN void CleanUpPubdescBody (PubdescPtr pdp, Boolean stripSerial); 729 730 NLM_EXTERN void CleanStructuredComment (UserObjectPtr uop); 731 732 NLM_EXTERN void SortSeqEntryQualifiers (SeqEntryPtr sep); 733 734 NLM_EXTERN void CleanUpProteinTitles (SeqEntryPtr sep); 735 736 /* BasicSeqAnnotCleanup is for cleaning up contents of separate named Seq-annot objects */ 737 738 NLM_EXTERN void BasicSeqAnnotCleanup (SeqAnnotPtr sap); 739 740 NLM_EXTERN void RemoveUnnecessaryGeneXrefs (SeqFeatPtr sfp, Pointer userdata); 741 742 /* CautiousSeqEntryCleanup is a gradual consolidation and replacement of functions in SeriousSeqEntryCleanup, 743 which does change the itemID structure, and is intended to be safe for a retrofit of the ID database */ 744 745 NLM_EXTERN void CautiousSeqEntryCleanup (SeqEntryPtr sep, SeqEntryFunc taxfun, SeqEntryFunc taxmerge); 746 747 /* Convert a segmented or delta Bioseq to a raw Bioseq */ 748 749 NLM_EXTERN void SegOrDeltaBioseqToRaw (BioseqPtr bsp); 750 751 NLM_EXTERN void ConvertSegSetsToDeltaSequences (SeqEntryPtr sep); 752 753 NLM_EXTERN Boolean IsDeltaSeqWithFarpointers (BioseqPtr bsp); 754 755 /* UserFieldSort is similar to ValNodeSort but for user fields within a user object */ 756 NLM_EXTERN UserFieldPtr LIBCALL UserFieldSort (UserFieldPtr list, int (LIBCALLBACK *compar ) PROTO((VoidPtr, VoidPtr))); 757 758 /* general purpose text finite state machine */ 759 /* based on Practical Algorithms for Programmers by Binstock and Rex */ 760 761 struct TextFsa; 762 typedef struct TextFsa* TextFsaPtr; 763 764 NLM_EXTERN TextFsaPtr TextFsaNew (void); 765 NLM_EXTERN void TextFsaAdd (TextFsaPtr tbl, CharPtr word); 766 NLM_EXTERN Int4 TextFsaNext (TextFsaPtr tbl, Int4 currState, Char ch, ValNodePtr PNTR matches); 767 NLM_EXTERN TextFsaPtr TextFsaFree (TextFsaPtr tbl); 768 NLM_EXTERN Boolean TextFsaGetStats (TextFsaPtr tbl, Int4Ptr highStateP, Int4Ptr numWordsP, Int4Ptr longestWordP); 769 770 /* PCR_primer manipulation functions */ 771 772 typedef struct pcrset { 773 CharPtr fwd_seq; 774 CharPtr rev_seq; 775 CharPtr fwd_name; 776 CharPtr rev_name; 777 Int2 orig_order; 778 } PcrSet, PNTR PcrSetPtr; 779 780 NLM_EXTERN ValNodePtr ParsePCRSet (BioSourcePtr biop); 781 NLM_EXTERN ValNodePtr ParsePCRStrings ( 782 CharPtr fwd_primer_seq, 783 CharPtr rev_primer_seq, 784 CharPtr fwd_primer_name, 785 CharPtr rev_primer_name 786 ); 787 NLM_EXTERN SubSourcePtr WritePCRSet (ValNodePtr pset); 788 NLM_EXTERN ValNodePtr FreePCRSet (ValNodePtr pset); 789 790 NLM_EXTERN int LIBCALLBACK SortVnpByPCRSetSeq (VoidPtr ptr1, VoidPtr ptr2); 791 NLM_EXTERN int LIBCALLBACK SortVnpByPCRSetOrder (VoidPtr ptr1, VoidPtr ptr2); 792 793 NLM_EXTERN ValNodePtr UniqueVnpByPCRSetSeq (ValNodePtr pset); 794 795 NLM_EXTERN void ModernizePCRPrimers ( 796 BioSourcePtr biop 797 ); 798 799 /* 800 very simple explore functions - VisitOn only does one chain, VisitIn goes into set components, 801 they now return a count of the number of nodes visited, and the callback can be NULL if the purpose 802 is simply to count nodes 803 */ 804 805 typedef void (*VisitDescriptorsFunc) (SeqDescrPtr sdp, Pointer userdata); 806 NLM_EXTERN Int4 VisitDescriptorsOnBsp (BioseqPtr bsp, Pointer userdata, VisitDescriptorsFunc callback); 807 NLM_EXTERN Int4 VisitDescriptorsOnSet (BioseqSetPtr bssp, Pointer userdata, VisitDescriptorsFunc callback); 808 NLM_EXTERN Int4 VisitDescriptorsInSet (BioseqSetPtr bssp, Pointer userdata, VisitDescriptorsFunc callback); 809 NLM_EXTERN Int4 VisitDescriptorsOnSep (SeqEntryPtr sep, Pointer userdata, VisitDescriptorsFunc callback); 810 NLM_EXTERN Int4 VisitDescriptorsInSep (SeqEntryPtr sep, Pointer userdata, VisitDescriptorsFunc callback); 811 812 typedef void (*VisitFeaturesFunc) (SeqFeatPtr sfp, Pointer userdata); 813 NLM_EXTERN Int4 VisitFeaturesOnSap (SeqAnnotPtr sap, Pointer userdata, VisitFeaturesFunc callback); 814 NLM_EXTERN Int4 VisitFeaturesOnBsp (BioseqPtr bsp, Pointer userdata, VisitFeaturesFunc callback); 815 NLM_EXTERN Int4 VisitFeaturesOnSet (BioseqSetPtr bssp, Pointer userdata, VisitFeaturesFunc callback); 816 NLM_EXTERN Int4 VisitFeaturesInSet (BioseqSetPtr bssp, Pointer userdata, VisitFeaturesFunc callback); 817 NLM_EXTERN Int4 VisitFeaturesOnSep (SeqEntryPtr sep, Pointer userdata, VisitFeaturesFunc callback); 818 NLM_EXTERN Int4 VisitFeaturesInSep (SeqEntryPtr sep, Pointer userdata, VisitFeaturesFunc callback); 819 820 typedef void (*VisitAlignmentsFunc) (SeqAlignPtr sap, Pointer userdata); 821 NLM_EXTERN Int4 VisitAlignmentsOnSap (SeqAnnotPtr sap, Pointer userdata, VisitAlignmentsFunc callback); 822 NLM_EXTERN Int4 VisitAlignmentsOnBsp (BioseqPtr bsp, Pointer userdata, VisitAlignmentsFunc callback); 823 NLM_EXTERN Int4 VisitAlignmentsOnSet (BioseqSetPtr bssp, Pointer userdata, VisitAlignmentsFunc callback); 824 NLM_EXTERN Int4 VisitAlignmentsInSet (BioseqSetPtr bssp, Pointer userdata, VisitAlignmentsFunc callback); 825 NLM_EXTERN Int4 VisitAlignmentsOnSep (SeqEntryPtr sep, Pointer userdata, VisitAlignmentsFunc callback); 826 NLM_EXTERN Int4 VisitAlignmentsInSep (SeqEntryPtr sep, Pointer userdata, VisitAlignmentsFunc callback); 827 828 typedef void (*VisitGraphsFunc) (SeqGraphPtr sgp, Pointer userdata); 829 NLM_EXTERN Int4 VisitGraphsOnSap (SeqAnnotPtr sap, Pointer userdata, VisitGraphsFunc callback); 830 NLM_EXTERN Int4 VisitGraphsOnBsp (BioseqPtr bsp, Pointer userdata, VisitGraphsFunc callback); 831 NLM_EXTERN Int4 VisitGraphsOnSet (BioseqSetPtr bssp, Pointer userdata, VisitGraphsFunc callback); 832 NLM_EXTERN Int4 VisitGraphsInSet (BioseqSetPtr bssp, Pointer userdata, VisitGraphsFunc callback); 833 NLM_EXTERN Int4 VisitGraphsOnSep (SeqEntryPtr sep, Pointer userdata, VisitGraphsFunc callback); 834 NLM_EXTERN Int4 VisitGraphsInSep (SeqEntryPtr sep, Pointer userdata, VisitGraphsFunc callback); 835 836 typedef void (*VisitAnnotsFunc) (SeqAnnotPtr sap, Pointer userdata); 837 NLM_EXTERN Int4 VisitAnnotsOnBsp (BioseqPtr bsp, Pointer userdata, VisitAnnotsFunc callback); 838 NLM_EXTERN Int4 VisitAnnotsOnSet (BioseqSetPtr bssp, Pointer userdata, VisitAnnotsFunc callback); 839 NLM_EXTERN Int4 VisitAnnotsInSet (BioseqSetPtr bssp, Pointer userdata, VisitAnnotsFunc callback); 840 NLM_EXTERN Int4 VisitAnnotsOnSep (SeqEntryPtr sep, Pointer userdata, VisitAnnotsFunc callback); 841 NLM_EXTERN Int4 VisitAnnotsInSep (SeqEntryPtr sep, Pointer userdata, VisitAnnotsFunc callback); 842 843 typedef void (*VisitBioseqsFunc) (BioseqPtr bsp, Pointer userdata); 844 NLM_EXTERN Int4 VisitBioseqsInSet (BioseqSetPtr bssp, Pointer userdata, VisitBioseqsFunc callback); 845 NLM_EXTERN Int4 VisitBioseqsInSep (SeqEntryPtr sep, Pointer userdata, VisitBioseqsFunc callback); 846 847 /* VisitSequences allows you to limit visitation to nucs or prots that aren't parts, or just to parts */ 848 849 #define VISIT_MAINS 1 850 #define VISIT_NUCS 2 851 #define VISIT_PROTS 3 852 #define VISIT_PARTS 4 853 854 typedef void (*VisitSequencesFunc) (BioseqPtr bsp, Pointer userdata); 855 NLM_EXTERN Int4 VisitSequencesInSet (BioseqSetPtr bssp, Pointer userdata, Int2 filter, VisitSequencesFunc callback); 856 NLM_EXTERN Int4 VisitSequencesInSep (SeqEntryPtr sep, Pointer userdata, Int2 filter, VisitSequencesFunc callback); 857 858 typedef void (*VisitSetsFunc) (BioseqSetPtr bssp, Pointer userdata); 859 NLM_EXTERN Int4 VisitSetsInSep (SeqEntryPtr sep, Pointer userdata, VisitSetsFunc callback); 860 NLM_EXTERN Int4 VisitSetsInSet (BioseqSetPtr bssp, Pointer userdata, VisitSetsFunc callback); 861 862 /* visits components of pop/phy/mut/genbank sets, callback is at most nuc-prot set, can then call above functions */ 863 864 typedef void (*VisitElementsFunc) (SeqEntryPtr sep, Pointer userdata); 865 NLM_EXTERN Int4 VisitElementsInSep (SeqEntryPtr sep, Pointer userdata, VisitElementsFunc callback); 866 867 /* visits all SeqIds within a SeqLoc, or within features, alignments, graphs, or annots */ 868 869 typedef void (*VisitSeqIdFunc) (SeqIdPtr sip, Pointer userdata); 870 NLM_EXTERN Int4 VisitSeqIdsInSeqLoc (SeqLocPtr slp, Pointer userdata, VisitSeqIdFunc callback); 871 872 NLM_EXTERN Int4 VisitSeqIdsInBioseq (BioseqPtr bsp, Pointer userdata, VisitSeqIdFunc callback); 873 NLM_EXTERN Int4 VisitSeqIdsInSeqFeat (SeqFeatPtr sfp, Pointer userdata, VisitSeqIdFunc callback); 874 NLM_EXTERN Int4 VisitSeqIdsInSeqAlign (SeqAlignPtr sap, Pointer userdata, VisitSeqIdFunc callback); 875 NLM_EXTERN Int4 VisitSeqIdsInSeqGraph (SeqGraphPtr sgp, Pointer userdata, VisitSeqIdFunc callback); 876 NLM_EXTERN Int4 VisitSeqIdsInSeqAnnot (SeqAnnotPtr annot, Pointer userdata, VisitSeqIdFunc callback); 877 878 /* visits all sub UserFields - if the data type is 11, VisitUserFieldsInUfp recurses */ 879 880 typedef void (*VisitUserFieldsFunc) (UserFieldPtr ufp, Pointer userdata); 881 NLM_EXTERN Int4 VisitUserFieldsInUfp (UserFieldPtr ufp, Pointer userdata, VisitUserFieldsFunc callback); 882 NLM_EXTERN Int4 VisitUserFieldsInUop (UserObjectPtr uop, Pointer userdata, VisitUserFieldsFunc callback); 883 884 /* visits all sub UserObjects if the data type is 12 - needed to pack multiple user objects on a single feature. Does not visit user objects which contain other user objects. */ 885 886 typedef void (*VisitUserObjectFunc) (UserObjectPtr uop, Pointer userdata); 887 NLM_EXTERN Int4 VisitUserObjectsInUop (UserObjectPtr uop, Pointer userdata, VisitUserObjectFunc callback); 888 /* Visits all user objects, even if they contain other user objects */ 889 NLM_EXTERN Int4 VisitAllUserObjectsInUop (UserObjectPtr uop, Pointer userdata, VisitUserObjectFunc callback); 890 891 /* explores sub UserObjects including "CombinedFeatureUserObjects" and finds by label */ 892 893 NLM_EXTERN UserObjectPtr FindUopByTag (UserObjectPtr top, CharPtr tag); 894 895 /* creates "CombinedFeatureUserObjects" sfp->ext to combine two user objects */ 896 897 NLM_EXTERN UserObjectPtr CombineUserObjects (UserObjectPtr origuop, UserObjectPtr newuop); 898 899 /* visits all publication descriptors or features */ 900 901 typedef void (*VisitPubdescsFunc) (PubdescPtr pdp, Pointer userdata); 902 NLM_EXTERN Int4 VisitPubdescsOnBsp (BioseqPtr bsp, Pointer userdata, VisitPubdescsFunc callback); 903 NLM_EXTERN Int4 VisitPubdescsOnSet (BioseqSetPtr bssp, Pointer userdata, VisitPubdescsFunc callback); 904 NLM_EXTERN Int4 VisitPubdescsInSet (BioseqSetPtr bssp, Pointer userdata, VisitPubdescsFunc callback); 905 NLM_EXTERN Int4 VisitPubdescsOnSep (SeqEntryPtr sep, Pointer userdata, VisitPubdescsFunc callback); 906 NLM_EXTERN Int4 VisitPubdescsInSep (SeqEntryPtr sep, Pointer userdata, VisitPubdescsFunc callback); 907 908 /* visits all authors in a publication */ 909 910 typedef void (*VisitAuthorFunc) (NameStdPtr nsp, Pointer userdata); 911 NLM_EXTERN Int4 VisitAuthorsInPub (PubdescPtr pdp, Pointer userdata, VisitAuthorFunc callback); 912 913 /* visits all biosource descriptors or features */ 914 915 typedef void (*VisitBioSourcesFunc) (BioSourcePtr biop, Pointer userdata); 916 NLM_EXTERN Int4 VisitBioSourcesOnBsp (BioseqPtr bsp, Pointer userdata, VisitBioSourcesFunc callback); 917 NLM_EXTERN Int4 VisitBioSourcesOnSet (BioseqSetPtr bssp, Pointer userdata, VisitBioSourcesFunc callback); 918 NLM_EXTERN Int4 VisitBioSourcesInSet (BioseqSetPtr bssp, Pointer userdata, VisitBioSourcesFunc callback); 919 NLM_EXTERN Int4 VisitBioSourcesOnSep (SeqEntryPtr sep, Pointer userdata, VisitBioSourcesFunc callback); 920 NLM_EXTERN Int4 VisitBioSourcesInSep (SeqEntryPtr sep, Pointer userdata, VisitBioSourcesFunc callback); 921 922 /* function to scan binary ASN.1 file of entire release as Bioseq-set, simple explore from successive top seps */ 923 /* compressed can be TRUE only on UNIX, where it does a popen on zcat to decompress on-the-fly */ 924 /* although it now returns a count of components visited, the callback cannot be NULL for this function */ 925 926 typedef void (*ScanBioseqSetFunc) (SeqEntryPtr sep, Pointer userdata); 927 NLM_EXTERN Int4 ScanBioseqSetRelease ( 928 CharPtr inputFile, 929 Boolean binary, 930 Boolean compressed, 931 Pointer userdata, 932 ScanBioseqSetFunc callback 933 ); 934 935 /* multi-thread safe version does not free SeqEntryPtr after calling callback, use FreeScanSeqEntryMT */ 936 NLM_EXTERN Int4 ScanBioseqSetReleaseMT ( 937 CharPtr inputFile, 938 Boolean binary, 939 Boolean compressed, 940 Pointer userdata, 941 ScanBioseqSetFunc callback 942 ); 943 NLM_EXTERN SeqEntryPtr LIBCALL FreeScanSeqEntryMT ( 944 SeqEntryPtr sep 945 ); 946 947 /* More automatic version of ReadAsnFastaOrFlatFile, can read BioseqSet release file */ 948 949 NLM_EXTERN Int4 ReadSequenceAsnFile ( 950 CharPtr inputFile, 951 Boolean binary, 952 Boolean compressed, 953 Pointer userdata, 954 ScanBioseqSetFunc callback 955 ); 956 957 /* function to scan binary ASN.1 file of entrezgene release as Entrezgene-Set */ 958 959 typedef void (*ScanEntrezgeneSetFunc) (EntrezgenePtr egp, Pointer userdata); 960 NLM_EXTERN Int4 ScanEntrezgeneSetRelease ( 961 CharPtr inputFile, 962 Boolean binary, 963 Boolean compressed, 964 Pointer userdata, 965 ScanEntrezgeneSetFunc callback 966 ); 967 968 /* PubMed registered fetch functionality */ 969 970 NLM_EXTERN PubmedEntryPtr LIBCALL GetPubMedForUid (Int4 uid); 971 972 /* internal support type, registration function */ 973 974 typedef PubmedEntryPtr (LIBCALLBACK * PubMedFetchFunc) (Int4 uid); 975 976 NLM_EXTERN void LIBCALL PubMedSetFetchFunc (PubMedFetchFunc func); 977 978 NLM_EXTERN void FirstNameToInitials (CharPtr first, CharPtr inits, size_t maxsize); 979 980 extern CharPtr MyFGetLine (FILE *fp, ValNodePtr PNTR current_data); 981 982 #if defined (WIN32) 983 extern char * __stdcall AbstractReadFunction (Pointer userdata); 984 extern void __stdcall AbstractReportError (TErrorInfoPtr err_ptr, Pointer userdata); 985 #else 986 extern char * AbstractReadFunction (Pointer userdata); 987 extern void AbstractReportError (TErrorInfoPtr err_ptr, Pointer userdata); 988 #endif 989 990 typedef struct readbuffer { 991 FILE *fp; 992 ValNodePtr current_data; 993 } ReadBufferData, PNTR ReadBufferPtr; 994 995 extern void FreeBufferedReadList (ValNodePtr vnp); 996 997 extern CharPtr AlignmentStringToSequenceString (CharPtr aln_str, Uint1 moltype); 998 extern SeqEntryPtr MakeSequinDataFromAlignment (TAlignmentFilePtr afp, Uint1 moltype); 999 extern SeqEntryPtr MakeSequinDataFromAlignmentEx (TAlignmentFilePtr afp, Uint1 moltype, Boolean check_ids); 1000 extern SeqEntryPtr make_seqentry_for_seqentry (SeqEntryPtr sep); 1001 extern Boolean ConvertOnePseudoCDSToMiscFeat (SeqFeatPtr sfp); 1002 NLM_EXTERN Boolean ConvertOnePseudoCDSToMiscFeatEx (SeqFeatPtr sfp, Boolean remove_product); 1003 extern void ConvertPseudoCDSToMiscFeatsForEntityID (Uint2 entityID); 1004 1005 extern SeqAlignPtr FindAlignmentsForBioseq (BioseqPtr bsp); 1006 extern ValNodePtr FindAlignSeqAnnotsForBioseq (BioseqPtr bsp); 1007 extern Boolean IsSequenceFirstInPairwise (SeqEntryPtr sep, SeqIdPtr sip); 1008 extern SeqAlignPtr RemoveOneSequenceFromAlignment (SeqIdPtr sip, SeqAlignPtr salphead); 1009 extern Boolean RemoveSequenceFromAlignments (SeqEntryPtr sep, SeqIdPtr sip); 1010 extern BioseqPtr ReadFastaOnly (FILE *fp, 1011 Boolean forceNuc, Boolean forceProt, 1012 BoolPtr chars_stripped, 1013 CharPtr lastchar); 1014 extern void MergeFeatureIntervalsToParts (SeqFeatPtr sfp, Boolean ordered); 1015 1016 extern void ExtendSingleGeneOnMRNA (BioseqPtr bsp, Pointer userdata); 1017 1018 typedef struct loginfo 1019 { 1020 FILE *fp; 1021 Boolean data_in_log; 1022 CharPtr display_title; 1023 Char path[PATH_MAX]; 1024 } LogInfoData, PNTR LogInfoPtr; 1025 1026 extern LogInfoPtr OpenLog (CharPtr display_title); 1027 extern LogInfoPtr FreeLog (LogInfoPtr lip); 1028 1029 NLM_EXTERN void FixNonWGSSets (ValNodePtr item_list, Pointer data, LogInfoPtr lip); 1030 1031 /* structures and functions for the Discrepancy Report */ 1032 typedef void (*ClickableCallback) (ValNodePtr item_list, Pointer userdata); 1033 typedef void (*ClickableCallbackDataFree) (Pointer userdata); 1034 typedef void (*AutofixCallback) (ValNodePtr item_list, Pointer userdata, LogInfoPtr lip); 1035 1036 typedef struct clickableitem 1037 { 1038 Uint4 clickable_item_type; 1039 CharPtr description; 1040 ValNodePtr item_list; 1041 ClickableCallback callback_func; 1042 ClickableCallbackDataFree datafree_func; 1043 Pointer callback_data; 1044 Boolean chosen; 1045 ValNodePtr subcategories; 1046 Boolean expanded; 1047 Int4 level; 1048 AutofixCallback autofix_func; /* note - autofix functions can be set for an 1049 * entire category or for an individual clickable 1050 * item. Don't set autofix functions in both 1051 * places or they will both be called. 1052 */ 1053 Pointer autofix_data; /* data for item-specific autofixes */ 1054 } ClickableItemData, PNTR ClickableItemPtr; 1055 1056 extern ClickableItemPtr 1057 NewClickableItem 1058 (Uint4 clickable_item_type, 1059 CharPtr description_fmt, 1060 ValNodePtr item_list); 1061 1062 extern ClickableItemPtr 1063 NewClickableItemNoList 1064 (Uint4 clickable_item_type, 1065 CharPtr description); 1066 1067 extern ValNodePtr ClickableItemObjectListFree (ValNodePtr vnp); 1068 extern ValNodePtr ClickableItemObjectListCopy (ValNodePtr orig); 1069 extern ClickableItemPtr ClickableItemFree (ClickableItemPtr cip); 1070 extern ValNodePtr FreeClickableList (ValNodePtr list); 1071 extern Boolean AnyDiscrepanciesChosen (ValNodePtr cip_list); 1072 NLM_EXTERN void ChooseAllDiscrepancies (ValNodePtr cip_list); 1073 1074 extern int LIBCALLBACK SortVnpByClickableItemDescription (VoidPtr ptr1, VoidPtr ptr2); 1075 NLM_EXTERN int LIBCALLBACK SortVnpByClickableItemChosen (VoidPtr ptr1, VoidPtr ptr2); 1076 1077 extern void ExpandClickableItemList (ValNodePtr vnp); 1078 extern void ContractClickableItemList (ValNodePtr vnp); 1079 1080 NLM_EXTERN void RemoveDuplicateItems (ValNodePtr PNTR item_list); 1081 1082 /* To add a new type of test, do ALL Of the following: 1083 * 1. add an item to the DiscrepancyType enum (this will fill the clickable_item_type value) 1084 * 2. add a collection function and declare it with the others 1085 * 3. add an item to discrepancy_info_list that corresponds with the position of the 1086 * new enum value. If you are combining multiple types in one collection function, 1087 * be sure to list them together. 1088 */ 1089 1090 /* SHOW_TRANSL_EXCEPT added by J. Chen */ 1091 /* SHOW_HYPOTHETICAL_CDS_HAVING_GENE_NAME added, J. Chen */ 1092 /* TEST_DEFLINE_EXIST added, J. Chen */ 1093 typedef enum { 1094 DISC_GENE_MISSING = 0, 1095 DISC_SUPERFLUOUS_GENE, 1096 DISC_GENE_MISSING_LOCUS_TAG, 1097 DISC_GENE_DUPLICATE_LOCUS_TAG, 1098 DISC_GENE_LOCUS_TAG_BAD_FORMAT, 1099 DISC_GENE_LOCUS_TAG_INCONSISTENT_PREFIX, 1100 DISC_NON_GENE_LOCUS_TAG, 1101 DISC_COUNT_NUCLEOTIDES, 1102 DISC_MISSING_PROTEIN_ID, 1103 DISC_INCONSISTENT_PROTEIN_ID_PREFIX, 1104 DISC_GENE_CDS_mRNA_LOCATION_CONFLICT, 1105 DISC_GENE_PRODUCT_CONFLICT, 1106 DISC_GENE_DUPLICATE_LOCUS, 1107 DISC_EC_NUMBER_NOTE, 1108 DISC_PSEUDO_MISMATCH, 1109 DISC_JOINED_FEATURES, 1110 DISC_OVERLAPPING_GENES, 1111 DISC_OVERLAPPING_CDS, 1112 DISC_CONTAINED_CDS, 1113 DISC_RNA_CDS_OVERLAP, 1114 DISC_SHORT_CONTIG, 1115 DISC_INCONSISTENT_BIOSRC, 1116 DISC_SUSPECT_PRODUCT_NAME, 1117 DISC_PRODUCT_NAME_TYPO, 1118 DISC_PRODUCT_NAME_QUICKFIX, 1119 DISC_INCONSISTENT_BIOSRC_DEFLINE, 1120 DISC_PARTIAL_CDS_IN_COMPLETE_SEQUENCE, 1121 DISC_EC_NUMBER_ON_HYPOTHETICAL_PROTEIN, 1122 DISC_NO_TAXLOOKUP, 1123 DISC_BAD_TAXLOOKUP, 1124 DISC_SHORT_SEQUENCE, 1125 DISC_SUSPECT_PHRASES, 1126 DISC_SUSPICIOUS_NOTE_TEXT, 1127 DISC_COUNT_TRNA, 1128 DISC_DUP_TRNA, 1129 DISC_BADLEN_TRNA, 1130 DISC_STRAND_TRNA, 1131 DISC_COUNT_RRNA, 1132 DISC_DUP_RRNA, 1133 DISC_RNA_NO_PRODUCT, 1134 DISC_TRANSL_NO_NOTE, 1135 DISC_NOTE_NO_TRANSL, 1136 DISC_TRANSL_TOO_LONG, 1137 DISC_CDS_OVERLAP_TRNA, 1138 DISC_COUNT_PROTEINS, 1139 DISC_FEAT_OVERLAP_SRCFEAT, 1140 DISC_MISSING_GENPRODSET_PROTEIN, 1141 DISC_DUP_GENPRODSET_PROTEIN, 1142 DISC_MISSING_GENPRODSET_TRANSCRIPT_ID, 1143 DISC_DUP_GENPRODSET_TRANSCRIPT_ID, 1144 DISC_PERCENTN, 1145 DISC_N_RUNS, 1146 DISC_ZERO_BASECOUNT, 1147 DISC_ADJACENT_PSEUDOGENE, 1148 DISC_LONG_NO_ANNOTATION, 1149 DISC_NO_ANNOTATION, 1150 DISC_INFLUENZA_DATE_MISMATCH, 1151 DISC_SHORT_INTRON, 1152 DISC_MISSING_VIRAL_QUALS, 1153 DISC_SRC_QUAL_PROBLEM, 1154 DISC_MISSING_SRC_QUAL, 1155 DISC_DUP_SRC_QUAL, 1156 DISC_DUP_SRC_QUAL_DATA, 1157 DISC_HAPLOTYPE_MISMATCH, 1158 DISC_FEATURE_MOLTYPE_MISMATCH, 1159 DISC_CDS_WITHOUT_MRNA, 1160 DISC_EXON_INTRON_CONFLICT, 1161 DISC_FEATURE_COUNT, 1162 DISC_SPECVOUCHER_TAXNAME_MISMATCH, 1163 DISC_GENE_PARTIAL_CONFLICT, 1164 DISC_FLATFILE_FIND_ONCALLER, 1165 DISC_FLATFILE_FIND_ONCALLER_FIXABLE, 1166 DISC_FLATFILE_FIND_ONCALLER_UNFIXABLE, 1167 DISC_CDS_PRODUCT_FIND, 1168 DISC_DUP_DEFLINE, 1169 DUP_DISC_ATCC_CULTURE_CONFLICT, 1170 DISC_USA_STATE, 1171 DISC_INCONSISTENT_MOLTYPES, 1172 DISC_SUBMITBLOCK_CONFLICT, 1173 DISC_POSSIBLE_LINKER, 1174 DISC_TITLE_AUTHOR_CONFLICT, 1175 DISC_BAD_GENE_STRAND, 1176 DISC_MAP_CHROMOSOME_CONFLICT, 1177 DISC_RBS_WITHOUT_GENE, 1178 DISC_CITSUBAFFIL_CONFLICT, 1179 DISC_REQUIRED_CLONE, 1180 DISC_SOURCE_QUALS_ASNDISC, 1181 DISC_mRNA_ON_WRONG_SEQUENCE_TYPE, 1182 DISC_RETROVIRIDAE_DNA, 1183 DISC_CHECK_AUTH_CAPS, 1184 DISC_CHECK_RNA_PRODUCTS_AND_COMMENTS, 1185 DISC_MICROSATELLITE_REPEAT_TYPE, 1186 DISC_MITOCHONDRION_REQUIRED, 1187 DISC_UNPUB_PUB_WITHOUT_TITLE, 1188 DISC_QUALITY_SCORES, 1189 DISC_INTERNAL_TRANSCRIBED_SPACER_RRNA, 1190 DISC_PARTIAL_PROBLEMS, 1191 DISC_BACTERIAL_PARTIAL_NONEXTENDABLE_PROBLEMS, 1192 DISC_BACTERIAL_PARTIAL_NONEXTENDABLE_EXCEPTION, 1193 DISC_SUSPECT_RRNA_PRODUCTS, 1194 DISC_SUSPECT_MISC_FEATURES, 1195 DISC_BACTERIA_MISSING_STRAIN, 1196 DISC_MISSING_DEFLINES, 1197 DISC_MISSING_AFFIL, 1198 DISC_BACTERIA_SHOULD_NOT_HAVE_ISOLATE, 1199 DISC_BACTERIA_SHOULD_NOT_HAVE_MRNA, 1200 DISC_CDS_HAS_NEW_EXCEPTION, 1201 DISC_TRINOMIAL_SHOULD_HAVE_QUALIFIER, 1202 DISC_METAGENOMIC, 1203 DISC_METAGENOME_SOURCE, 1204 ONCALLER_GENE_MISSING, 1205 ONCALLER_SUPERFLUOUS_GENE, 1206 DISC_SHORT_RRNA, 1207 ONCALLER_CHECK_AUTHORITY, 1208 ONCALLER_CONSORTIUM, 1209 ONCALLER_STRAIN_CULTURE_COLLECTION_MISMATCH, 1210 ONCALLER_MULTISRC, 1211 ONCALLER_MULTIPLE_CULTURE_COLLECTION, 1212 DISC_SEGSETS_PRESENT, 1213 DISC_NONWGS_SETS_PRESENT, 1214 DISC_FEATURE_LIST, 1215 DISC_CATEGORY_HEADER, 1216 DISC_MISMATCHED_COMMENTS, 1217 DISC_STRAIN_TAXNAME_MISMATCH, 1218 DISC_HUMAN_HOST, 1219 DISC_BAD_BACTERIAL_GENE_NAME, 1220 TEST_BAD_GENE_NAME, 1221 ONCALLER_ORDERED_LOCATION, 1222 ONCALLER_COMMENT_PRESENT, 1223 ONCALLER_DEFLINE_ON_SET, 1224 ONCALLER_HIV_RNA_INCONSISTENT, 1225 SHORT_PROT_SEQUENCES, 1226 TEST_EXON_ON_MRNA, 1227 TEST_HAS_PROJECT_ID, 1228 ONCALLER_HAS_STANDARD_NAME, 1229 ONCALLER_MISSING_STRUCTURED_COMMENTS, 1230 DISC_REQUIRED_STRAIN, 1231 MISSING_GENOMEASSEMBLY_COMMENTS, 1232 DISC_BACTERIAL_TAX_STRAIN_MISMATCH, 1233 TEST_CDS_HAS_CDD_XREF, 1234 TEST_UNUSUAL_NT, 1235 TEST_LOW_QUALITY_REGION, 1236 TEST_ORGANELLE_NOT_GENOMIC, 1237 TEST_UNWANTED_SPACER, 1238 TEST_ORGANELLE_PRODUCTS, 1239 TEST_SP_NOT_UNCULTURED, 1240 TEST_BAD_MRNA_QUAL, 1241 TEST_UNNECESSARY_ENVIRONMENTAL, 1242 TEST_UNNECESSARY_VIRUS_GENE, 1243 TEST_UNWANTED_SET_WRAPPER, 1244 TEST_MISSING_PRIMER, 1245 TEST_UNUSUAL_MISC_RNA, 1246 TEST_AMPLIFIED_PRIMERS_NO_ENVIRONMENTAL_SAMPLE, 1247 TEST_DUP_GENES_OPPOSITE_STRANDS, 1248 TEST_SMALL_GENOME_SET_PROBLEM, 1249 TEST_OVERLAPPING_RRNAS, 1250 TEST_MRNA_SEQUENCE_MINUS_STRAND_FEATURES, 1251 TEST_TAXNAME_NOT_IN_DEFLINE, 1252 TEST_COUNT_UNVERIFIED, 1253 SHOW_TRANSL_EXCEPT, 1254 SHOW_HYPOTHETICAL_CDS_HAVING_GENE_NAME, 1255 TEST_DEFLINE_PRESENT, 1256 TEST_MRNA_OVERLAPPING_PSEUDO_GENE, 1257 FIND_OVERLAPPED_GENES, 1258 DISC_BIOMATERIAL_TAXNAME_MISMATCH, 1259 DISC_CULTURE_TAXNAME_MISMATCH, 1260 DISC_CHECK_AUTH_NAME, 1261 NON_RETROVIRIDAE_PROVIRAL, 1262 RNA_PROVIRAL, 1263 SHORT_SEQUENCES_200, 1264 DISC_10_PERCENTN, 1265 N_RUNS_14, 1266 MOLTYPE_NOT_MRNA, 1267 TECHNIQUE_NOT_TSA, 1268 MISSING_STRUCTURED_COMMENT, 1269 MISSING_PROJECT, 1270 MULTIPLE_CDS_ON_MRNA, 1271 DUP_DISC_CBS_CULTURE_CONFLICT, 1272 DIVISION_CODE_CONFLICTS, 1273 RRNA_NAME_CONFLICTS, 1274 EUKARYOTE_SHOULD_HAVE_MRNA, 1275 MRNA_SHOULD_HAVE_PROTEIN_TRANSCRIPT_IDS, 1276 ONCALLER_COUNTRY_COLON, 1277 ONCALLER_BIOPROJECT_ID, 1278 ONCALLER_STRAIN_TAXNAME_CONFLICT, 1279 ONCALLER_MORE_NAMES_COLLECTED_BY, 1280 ONCALLER_MORE_OR_SPEC_NAMES_IDENTIFIED_BY, 1281 ONCALLER_SUSPECTED_ORG_IDENTIFIED, 1282 ONCALLER_SUSPECTED_ORG_COLLECTED, 1283 ONCALLER_SWITCH_STRUCTURED_COMMENT_PREFIX, 1284 ONCALLER_CITSUB_AFFIL_DUP_TEXT, 1285 ONCALLER_DUPLICATE_PRIMER_SET, 1286 END_COLON_IN_COUNTRY, 1287 DISC_PROTEIN_NAMES, 1288 DISC_TITLE_ENDS_WITH_SEQUENCE, 1289 DISC_INCONSISTENT_STRUCTURED_COMMENTS, 1290 DISC_INCONSISTENT_DBLINK, 1291 DISC_INCONSISTENT_MOLINFO_TECH, 1292 DISC_GAPS, 1293 DISC_BAD_BGPIPE_QUALS, 1294 TEST_SHORT_LNCRNA, 1295 TEST_TERMINAL_NS, 1296 TEST_ALIGNMENT_HAS_SCORE, 1297 UNCULTURED_NOTES_ONCALLER, 1298 SEQ_ID_PHRASES, 1299 NO_PRODUCT_STRING, 1300 MAX_DISC_TYPE 1301 } DiscrepancyType; 1302 1303 typedef enum { 1304 eReportTypeDiscrepancy = 1, 1305 eReportTypeOnCaller, 1306 eReportTypeMegaReport, 1307 eReportTypeTSA, 1308 eReportType_End 1309 } EDiscrepancyReportType; 1310 1311 extern Boolean IsTestTypeAppropriateForReportType (Int4 test_type, EDiscrepancyReportType report_type); 1312 1313 extern void PrintDiscrepancyTestList (FILE *fp); 1314 1315 extern void SetDiscrepancyLevels (ValNodePtr discrepancy_list, Int4 level); 1316 1317 extern CharPtr GetDiscrepancyTestConfName (DiscrepancyType dtype); 1318 extern CharPtr GetDiscrepancyTestSettingName (DiscrepancyType dtype); 1319 extern DiscrepancyType GetDiscrepancyTypeFromSettingName (CharPtr setting_name); 1320 extern Boolean DiscrepancyTestHasAutofix (DiscrepancyType dtype); 1321 1322 typedef struct discrepancyconfig 1323 { 1324 Boolean conf_list[MAX_DISC_TYPE]; 1325 Boolean use_feature_table_format; 1326 Boolean use_big_test_set; 1327 Boolean is_big_sequence; 1328 } DiscrepancyConfigData, PNTR DiscrepancyConfigPtr; 1329 1330 extern DiscrepancyConfigPtr DiscrepancyConfigFree (DiscrepancyConfigPtr dcp); 1331 extern DiscrepancyConfigPtr DiscrepancyConfigNew (void); 1332 extern DiscrepancyConfigPtr DiscrepancyConfigCopy (DiscrepancyConfigPtr dcp); 1333 extern DiscrepancyConfigPtr ReadDiscrepancyConfig (void); 1334 extern DiscrepancyConfigPtr ReadDiscrepancyConfigEx (CharPtr report_config_name); 1335 extern void SaveDiscrepancyConfig (DiscrepancyConfigPtr dcp); 1336 extern void SaveDiscrepancyConfigEx (DiscrepancyConfigPtr dcp, CharPtr report_name); 1337 extern void DisableTRNATests (DiscrepancyConfigPtr dcp); 1338 extern CharPtr SetDiscrepancyReportTestsFromString (CharPtr list, Boolean enable, DiscrepancyConfigPtr dcp); 1339 extern void ConfigureForBigSequence (DiscrepancyConfigPtr dcp); 1340 extern void ConfigureForGenomes (DiscrepancyConfigPtr dcp); 1341 extern void ConfigureForReportType (DiscrepancyConfigPtr dcp, EDiscrepancyReportType report_type); 1342 1343 typedef void (*PerformDiscrepancyTest) PROTO ((ValNodePtr PNTR, ValNodePtr)); 1344 1345 extern ValNodePtr CollectDiscrepancies (DiscrepancyConfigPtr dcp, ValNodePtr sep_list, PerformDiscrepancyTest taxlookup); 1346 extern void AutofixDiscrepancies (ValNodePtr vnp, Boolean fix_all, LogInfoPtr lip); 1347 extern void ChooseFixableDiscrepancies (ValNodePtr vnp); 1348 extern CharPtr GetDiscrepancyItemText (ValNodePtr vnp); 1349 extern CharPtr GetDiscrepancyItemTextEx (ValNodePtr vnp, CharPtr filename); 1350 extern void VisitGenProdSetFeatures (SeqEntryPtr sep, Pointer userdata, VisitFeaturesFunc callback); 1351 extern ValNodePtr ReplaceDiscrepancyItemWithFeatureTableStrings (ValNodePtr feat_list); 1352 extern CharPtr GetParentLabelForDiscrepancyItem (ValNodePtr vnp); 1353 extern void WriteDiscrepancy (FILE *fp, ClickableItemPtr dip, Boolean use_feature_table_fmt); 1354 extern void WriteDiscrepancyEx (FILE *fp, ClickableItemPtr dip, Boolean use_feature_table_fmt, Boolean cmd_line, CharPtr descr_prefix, Boolean list_features_if_subcat); 1355 extern int LIBCALLBACK SortVnpByDiscrepancyDescription (VoidPtr ptr1, VoidPtr ptr2); 1356 extern int LIBCALLBACK SortVnpByDiscrepancyItemText (VoidPtr ptr1, VoidPtr ptr2); 1357 extern void ValNodeReverse (ValNodePtr PNTR list); 1358 1359 /* Individual discrepancy test function declarations */ 1360 extern const CharPtr kOverlappingCDSNoteText; 1361 extern const CharPtr kOverlappingCDSNeedsNoteFmt; 1362 extern void AddOverlappingCodingRegionDiscrepancies (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list); 1363 extern void AddDiscrepanciesForMissingOrNonUniqueGeneLocusTagsEx (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list, Boolean exclude_dirsub); 1364 extern void AddDiscrepanciesForMissingOrNonUniqueGeneLocusTags (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list); 1365 extern void FindShortIntronsEx (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list, Boolean check_organelles); 1366 extern void FindShortIntrons (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list); 1367 extern void CheckBioSourceQuals (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list); 1368 extern void FindExtendablePartials (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list); 1369 extern void FindBacterialNonExtendablePartials (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list); 1370 NLM_EXTERN void FindMismatchedComments (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list); 1371 1372 /* autofix functions */ 1373 NLM_EXTERN void MarkOverlappingCDSs (ValNodePtr item_list, Pointer data, LogInfoPtr lip); 1374 NLM_EXTERN void FixBacterialNonExtendablePartials (ValNodePtr item_list, Pointer data, LogInfoPtr lip); 1375 NLM_EXTERN void FixExtendablePartials (ValNodePtr item_list, Pointer data, LogInfoPtr lip); 1376 NLM_EXTERN void FixMismatchedComments (ValNodePtr item_list, Pointer data, LogInfoPtr lip); 1377 NLM_EXTERN void FixHumanHosts (ValNodePtr item_list, Pointer data, LogInfoPtr lip); 1378 NLM_EXTERN void FixOrderedLocations (ValNodePtr item_list, Pointer data, LogInfoPtr lip); 1379 NLM_EXTERN void OncallerToolPseudoDiscrepanciesFix (ValNodePtr item_list, Pointer data, LogInfoPtr lip); 1380 NLM_EXTERN void OncallerToolFindEcoNoEnvFix (ValNodePtr item_list, Pointer data, LogInfoPtr lip); 1381 NLM_EXTERN void AddExceptionsToShortIntrons (ValNodePtr item_list, Pointer data, LogInfoPtr lip); 1382 1383 NLM_EXTERN Boolean IsShortrRNA (SeqFeatPtr sfp); 1384 1385 /* structure shared by tbl2asn and discrepancy report functions */ 1386 typedef struct genprodsetdiscrepancylists { 1387 ValNodePtr cds_product_list; 1388 ValNodePtr mrna_product_list; 1389 ValNodePtr missing_mrna_product; 1390 ValNodePtr missing_protein_id; 1391 } GenProdSetDiscrepancyListsData, PNTR GenProdSetDiscrepancyListsPtr; 1392 1393 extern void CheckGenProdSetsInSeqEntry (SeqEntryPtr sep, GenProdSetDiscrepancyListsPtr lists); 1394 1395 1396 typedef struct protidlists { 1397 ValNodePtr missing_gnl_list; 1398 ValNodePtr gnl_list; 1399 } ProtIdListsData, PNTR ProtIdListsPtr; 1400 1401 /* structure shared by tbl2asn and discrepancy report functions */ 1402 typedef struct globaldiscrepancy { 1403 CharPtr str; 1404 Uint1 data_choice; 1405 Pointer data; 1406 } GlobalDiscrepancyData, PNTR GlobalDiscrepancyPtr; 1407 1408 extern GlobalDiscrepancyPtr GlobalDiscrepancyNew (CharPtr str, Uint1 data_choice, Pointer data); 1409 extern GlobalDiscrepancyPtr GlobalDiscrepancyFree (GlobalDiscrepancyPtr g); 1410 extern ValNodePtr FreeGlobalDiscrepancyList (ValNodePtr vnp); 1411 extern void ConvertGlobalDiscrepancyToText (GlobalDiscrepancyPtr g, Boolean use_feature_fmt, CharPtr filename); 1412 extern void ConvertGlobalDiscrepancyListToText (ValNodePtr vnp, Boolean use_feature_fmt, CharPtr filename); 1413 extern ValNodePtr GetGlobalDiscrepancyItem (GlobalDiscrepancyPtr g); 1414 extern CharPtr GetGlobalDiscrepancyStr (GlobalDiscrepancyPtr g); 1415 NLM_EXTERN int LIBCALLBACK SortVnpByGlobalDiscrepancyString (VoidPtr ptr1, VoidPtr ptr2); 1416 NLM_EXTERN int LIBCALLBACK SortVnpByGlobalDiscrepancyStringCaseSensitive (VoidPtr ptr1, VoidPtr ptr2); 1417 extern ClickableItemPtr 1418 ReportNonUniqueGlobalDiscrepancy 1419 (ValNodePtr vnp, 1420 CharPtr label_fmt, 1421 CharPtr ind_cat_fmt, 1422 Uint4 clickable_item_type, 1423 Boolean keep_top_category); 1424 extern ValNodePtr ReportInconsistentGlobalDiscrepancyPrefixes 1425 (ValNodePtr vnp, 1426 CharPtr label_fmt, 1427 Uint4 clickable_item_type); 1428 extern ValNodePtr ReportInconsistentGlobalDiscrepancyStrings 1429 (ValNodePtr vnp, 1430 CharPtr label_fmt, 1431 Uint4 clickable_item_type); 1432 extern ClickableItemPtr ReportMissingFields (ValNodePtr list, CharPtr label_fmt, Uint4 clickable_item_type); 1433 extern ClickableItemPtr ReportBadLocusTagFormat (ValNodePtr list); 1434 extern ClickableItemPtr FindAdjacentDuplicateLocusTagGenes (ValNodePtr locus_tag_list); 1435 extern void FindProteinIDCallback (BioseqPtr bsp, Pointer userdata); 1436 1437 1438 /* formats for global discrepancies also used by tbl2asn */ 1439 extern CharPtr discReportDuplicateLocusTagFmt; 1440 extern CharPtr discReportOneDuplicateLocusTagFmt; 1441 extern CharPtr discReportDuplicateProteinIDFmt; 1442 extern CharPtr discReportOneDuplicateProteinIDFmt; 1443 extern CharPtr discReportDuplicateTranscriptIdFmt; 1444 extern CharPtr discReportOneDuplicateTranscriptIdFmt; 1445 extern CharPtr discReportInconsistentLocusTagPrefixFmt; 1446 extern CharPtr discReportMissingLocusTags; 1447 extern CharPtr discReportInconsistentProteinIDPrefixFmt; 1448 extern CharPtr discReportBadProteinIdFmt; 1449 extern CharPtr discReportMissingTranscriptIDFmt; 1450 1451 extern CharPtr GetBioseqLabel (BioseqPtr bsp); 1452 extern CharPtr GetBioseqSetLabel (BioseqSetPtr bssp); 1453 1454 NLM_EXTERN ValNodePtr ValNodeDupStringList (ValNodePtr vnp); 1455 NLM_EXTERN ValNodePtr ValNodeDupIntList (ValNodePtr vnp); 1456 1457 typedef enum { 1458 eLocusTagErrorBadFormat, 1459 eLocusTagErrorDuplicate, 1460 eLocusTagErrorInconsistentPrefix 1461 } ELocusTagError; 1462 1463 NLM_EXTERN ValNodePtr FindBadLocusTagsInList (ValNodePtr list); 1464 1465 1466 typedef struct discreportoutputconfig { 1467 Boolean use_feature_table_format; 1468 Boolean expand_report_categories[MAX_DISC_TYPE]; 1469 Boolean summary_report; 1470 Boolean add_output_tag; 1471 Boolean add_extra_output_tag; 1472 Int4 num_nucs; 1473 } DiscReportOutputConfigData, PNTR DiscReportOutputConfigPtr; 1474 1475 NLM_EXTERN void AddToOutputConfig(SeqEntryPtr sep, DiscReportOutputConfigPtr c); 1476 NLM_EXTERN void AddListToOutputConfig(ValNodePtr list, DiscReportOutputConfigPtr c); 1477 1478 typedef struct globaldiscrepreport { 1479 ValNodeBlock locus_tag_list; 1480 ValNodeBlock missing_locus_tag; 1481 ValNodeBlock cds_product_list; 1482 ValNodeBlock missing_cds_product; 1483 ValNodeBlock mrna_product_list; 1484 ValNodeBlock missing_mrna_product; 1485 ValNodePtr adjacent_locus_tag_disc_list; 1486 ValNodeBlock missing_gnl_list; 1487 ValNodeBlock gnl_list; 1488 ValNodePtr global_src_qual_vals; 1489 ValNodePtr global_srcs; 1490 ValNodeBlock global_prot_name_list; 1491 1492 ValNodePtr src_qual_repeated_list; 1493 ValNodePtr src_qual_multi_list; 1494 ValNodeBlock feature_count_list; 1495 ValNodeBlock discrepancy_list; 1496 1497 PerformDiscrepancyTest taxlookup; 1498 DiscrepancyConfigPtr test_config; 1499 DiscReportOutputConfigPtr output_config; 1500 } GlobalDiscrepReportData, PNTR GlobalDiscrepReportPtr; 1501 1502 NLM_EXTERN GlobalDiscrepReportPtr GlobalDiscrepReportNew (); 1503 NLM_EXTERN GlobalDiscrepReportPtr GlobalDiscrepReportFree (GlobalDiscrepReportPtr g); 1504 NLM_EXTERN void AddSeqEntryToGlobalDiscrepReport (SeqEntryPtr sep, GlobalDiscrepReportPtr g, CharPtr filename); 1505 NLM_EXTERN Boolean WriteGlobalDiscrepancyReportEx (GlobalDiscrepReportPtr g, FILE *fp, CharPtr extra_comment); 1506 NLM_EXTERN void WriteGlobalDiscrepancyReport (GlobalDiscrepReportPtr g, FILE *fp); 1507 1508 NLM_EXTERN Boolean CollectionDateIsInTheFuture (CharPtr name); 1509 NLM_EXTERN Boolean CollectionDateIsValid (CharPtr name); 1510 NLM_EXTERN Boolean CollectionDatesInOrder (CharPtr name); 1511 1512 /* for the Barcode Discrepancy Test */ 1513 typedef enum { 1514 eBarcodeTest_Length = 0, 1515 eBarcodeTest_Primers, 1516 eBarcodeTest_Country, 1517 eBarcodeTest_SpecimenVoucher, 1518 eBarcodeTest_PercentN, 1519 eBarcodeTest_CollectionDate, 1520 eBarcodeTest_OrderAssignment, 1521 eBarcodeTest_LowTrace, 1522 eBarcodeTest_FrameShift, 1523 eBarcodeTest_StructuredSpecimenVoucher, 1524 eBarcodeTest_LAST 1525 } EBarcodeTest; 1526 1527 typedef struct barcodetestconfig 1528 { 1529 Boolean conf_list[eBarcodeTest_LAST]; 1530 Int4 min_length; 1531 FloatLo min_n_percent; 1532 Boolean require_keyword; 1533 } BarcodeTestConfigData, PNTR BarcodeTestConfigPtr; 1534 1535 extern BarcodeTestConfigPtr BarcodeTestConfigNew(); 1536 extern BarcodeTestConfigPtr BarcodeTestConfigFree (BarcodeTestConfigPtr cfg); 1537 1538 extern CharPtr GetBarcodeTestName (Int4 i); 1539 1540 extern Int4 GetBarcodeTestNumFromBarcodeTestName (CharPtr test_name); 1541 1542 typedef struct barcodetestresults 1543 { 1544 Boolean failed_tests[eBarcodeTest_LAST]; 1545 BioseqPtr bsp; 1546 FloatLo n_percent; 1547 Int4 num_trace; 1548 } BarcodeTestResultsData, PNTR BarcodeTestResultsPtr; 1549 1550 extern BarcodeTestResultsPtr BarcodeTestResultsNew (); 1551 extern BarcodeTestResultsPtr BarcodeTestResultsFree (BarcodeTestResultsPtr res); 1552 extern BarcodeTestResultsPtr BarcodeTestResultsCopy (BarcodeTestResultsPtr res); 1553 extern ValNodePtr BarcodeTestResultsListFree (ValNodePtr res_list); 1554 extern ValNodePtr BarcodeTestResultsExtractPass (ValNodePtr PNTR res_list); 1555 1556 extern Boolean IsBarcodeID (SeqIdPtr sip); 1557 1558 extern CharPtr BarcodeTestBarcodeIdString (BioseqPtr bsp); 1559 extern CharPtr BarcodeTestGenbankIdString (BioseqPtr bsp); 1560 1561 /* This one gets discrepancies by category */ 1562 extern ValNodePtr GetBarcodeDiscrepancies (ValNodePtr sep_list, BarcodeTestConfigPtr cfg); 1563 extern ValNodePtr GetBarcodePassFail (SeqEntryPtr sep, BarcodeTestConfigPtr cfg); 1564 NLM_EXTERN CharPtr GetBarcodeTestFailureReasons (BarcodeTestResultsPtr res); 1565 /* This one lists passes and failures, with reasons for failures */ 1566 extern void WriteBarcodeTestComprehensive (FILE *fp, ValNodePtr results_list); 1567 extern void WriteBarcodeDiscrepancies (FILE *fp, ValNodePtr results_list); 1568 extern void WriteBarcodeFailureReport (FILE *fp, ValNodePtr results_list); 1569 extern void WriteBarcodeTestCompliance (FILE *fp, ValNodePtr results_list); 1570 extern void WriteBarcodeTestComplianceEx (FILE *fp, ValNodePtr results_list, Boolean low_trace_fail); 1571 extern void WriteBarcodeTagTable (FILE *fp, ValNodePtr results_list); 1572 NLM_EXTERN Boolean IsIBOL (BioseqPtr bsp); 1573 1574 NLM_EXTERN Boolean 1575 BarcodeValidateOneSeqEntry 1576 (FILE *ofp, 1577 SeqEntryPtr sep, 1578 Boolean show_all, 1579 Boolean use_xml, 1580 Boolean show_header, 1581 CharPtr xml_header_text); 1582 extern void RemoveBarcodeTech (FILE *fp, ValNodePtr results_list); 1583 extern void RemoveBarcodeKeywords (FILE *fp, ValNodePtr results_list); 1584 extern void ApplyBarcodeKeywords (FILE *fp, ValNodePtr results_list); 1585 extern void ApplyBarcodeTech (FILE *fp, ValNodePtr results_list); 1586 extern Boolean PassBarcodeTests (BarcodeTestResultsPtr res); 1587 extern Boolean HasBARCODETech (BioseqPtr bsp); 1588 NLM_EXTERN void ApplyBarcodeKeywordToBioseq (BioseqPtr bsp); 1589 NLM_EXTERN Boolean BioseqHasBarcodeKeyword (BioseqPtr bsp); 1590 NLM_EXTERN Boolean BioseqHasKeyword (BioseqPtr bsp, CharPtr keyword); 1591 NLM_EXTERN void RemoveBarcodeKeywordsFromObjectList (FILE *fp, ValNodePtr object_list); 1592 NLM_EXTERN Boolean RemoveBarcodeTechFromBioseq (BioseqPtr bsp); 1593 extern Int4 CountPolymorphismsInBioseq (BioseqPtr bsp); 1594 NLM_EXTERN Boolean RemoveBarcodeKeywordFromBioseq (BioseqPtr bsp); 1595 1596 1597 extern CharPtr ExpandDiscrepancyReportTestsFromString (CharPtr list, Boolean expand, DiscReportOutputConfigPtr dcp); 1598 extern void CollateDiscrepancyReports (ValNodePtr PNTR discrepancy_reports); 1599 extern void WriteAsnDiscReport (ValNodePtr discrepancy_list, FILE *ofp, DiscReportOutputConfigPtr oc, Boolean use_flag); 1600 1601 1602 /* extern to allow access to subsource_subtype_alist */ 1603 typedef struct Nlm_qual_name_assoc { 1604 Nlm_CharPtr name; 1605 Uint1 value; 1606 } Nlm_QualNameAssoc, PNTR Nlm_QualNameAssocPtr, Nlm_QualNameAlist[]; 1607 1608 typedef struct Nlm_name_name_assoc { 1609 Nlm_CharPtr name; 1610 Nlm_CharPtr alias; 1611 Uint1 value; 1612 } Nlm_NameNameAssoc, PNTR Nlm_NameNameAssocPtr, Nlm_NameNameAlist[]; 1613 1614 extern Nlm_QualNameAssoc current_orgmod_subtype_alist[]; 1615 extern Nlm_QualNameAssoc discouraged_orgmod_subtype_alist[]; 1616 extern Nlm_QualNameAssoc discontinued_orgmod_subtype_alist[]; 1617 extern Nlm_NameNameAssoc orgmod_aliases[]; 1618 extern CharPtr GetOrgModQualName (Uint1 subtype); 1619 extern void BioSourceHasOldOrgModQualifiers (BioSourcePtr biop, BoolPtr has_discouraged, BoolPtr has_discontinued); 1620 NLM_EXTERN void StringHasOrgModPrefix (CharPtr str, CharPtr PNTR pval, Uint1Ptr p_subtypeval, Boolean skippref); 1621 NLM_EXTERN CharPtr StringHasPrefix (CharPtr str, CharPtr pref, Boolean novalneeded, Boolean skippref); 1622 1623 extern Nlm_QualNameAssoc current_subsource_subtype_alist []; 1624 extern Nlm_QualNameAssoc discouraged_subsource_subtype_alist[]; 1625 extern Nlm_QualNameAssoc discontinued_subsource_subtype_alist[]; 1626 extern Nlm_NameNameAssoc subsource_aliases []; 1627 extern CharPtr GetSubsourceQualName (Uint1 subtype); 1628 extern void BioSourceHasOldSubSourceQualifiers (BioSourcePtr biop, BoolPtr has_discouraged, BoolPtr has_discontinued); 1629 extern Boolean GeneRefMatch (GeneRefPtr grp1, GeneRefPtr grp2); 1630 extern Boolean DbxrefsMatch (ValNodePtr vnp1, ValNodePtr vnp2, Boolean case_sensitive); 1631 extern Boolean XrefsMatch (SeqFeatXrefPtr x1, SeqFeatXrefPtr x2); 1632 extern Boolean ProtRefMatch (ProtRefPtr prp1, ProtRefPtr prp2); 1633 1634 extern void IsCorrectLatLonFormat (CharPtr lat_lon, BoolPtr format_correct, BoolPtr precision_correct, BoolPtr lat_in_range, BoolPtr lon_in_range); 1635 extern CharPtr FixLatLonFormat (CharPtr orig_lat_lon); 1636 extern Boolean ParseLatLon (CharPtr lat_lon, FloatHi PNTR latP, FloatHi PNTR lonP); 1637 extern void ApplyBarcodeDbxrefsToBioseq (BioseqPtr bsp, Pointer data); 1638 extern void ApplyFBOLDbxrefsToBioseq (BioseqPtr bsp, Pointer data); 1639 1640 extern CharPtr GetCountryFix (CharPtr country, CharPtr PNTR country_list); 1641 1642 extern CharPtr ncrnaClassList[]; 1643 extern Int4 NcrnaOTHER; 1644 extern Boolean IsStringInNcRNAClassList (CharPtr str); 1645 extern Boolean IsStringInRegulatoryClassList (CharPtr str); 1646 extern Boolean IsStringInRecombinationClassList (CharPtr str); 1647 extern ValNodePtr ListFeaturesInLocation (BioseqPtr bsp, SeqLocPtr slp, Uint1 seqfeatChoice, Uint1 featdefChoice); 1648 extern ValNodePtr ListCodingRegionsContainedInSourceFeatures (SeqEntryPtr sep); 1649 extern ValNodePtr ListFeaturesOverlappingLocationEx (BioseqPtr bsp, SeqLocPtr slp, Uint1 seqfeatChoice, Uint1 featdefChoice, ValNodePtr constraint); 1650 extern ValNodePtr ListFeaturesOverlappingLocation (BioseqPtr bsp, SeqLocPtr slp, Uint1 seqfeatChoice, Uint1 featdefChoice); 1651 1652 extern void ConvertSourceFeatDescProc (SeqFeatPtr sfp, Pointer userdata); 1653 1654 /* for correcting capitalization */ 1655 NLM_EXTERN void 1656 FixCapitalizationInElement 1657 (CharPtr PNTR pEl, 1658 Boolean bAbbrev, 1659 Boolean bShortWords, 1660 Boolean bApostrophes); 1661 1662 NLM_EXTERN void FixCapitalizationInAuthor (AuthorPtr pAuthor); 1663 NLM_EXTERN void FixCapsInPubAffil (AffilPtr affil); 1664 NLM_EXTERN void FixCapsInPubAffilEx (AffilPtr affil, Boolean punct_only); 1665 NLM_EXTERN void FixCapitalizationInCountryString (CharPtr PNTR pCountry); 1666 NLM_EXTERN void FixCapitalizationInCountryStringEx (CharPtr PNTR pCountry, Boolean punct_only); 1667 NLM_EXTERN void FixStateAbbreviationsInAffil (AffilPtr affil, LogInfoPtr lip); 1668 1669 NLM_EXTERN void FixAffiliationShortWordsInElement (CharPtr PNTR pEl); 1670 NLM_EXTERN void FixKnownAbbreviationsInElement (CharPtr PNTR pEl); 1671 1672 NLM_EXTERN void FixAbbreviationsInElement (CharPtr PNTR pEl); 1673 NLM_EXTERN void FixOrgNamesInString (CharPtr str, ValNodePtr org_names); 1674 NLM_EXTERN void ResetCapitalization (Boolean first_is_upper, CharPtr pString); 1675 1676 NLM_EXTERN SeqIdPtr CreateSeqIdFromText (CharPtr id_str, SeqEntryPtr sep); 1677 NLM_EXTERN SeqLocPtr SeqLocWholeNew (BioseqPtr bsp); 1678 NLM_EXTERN Int4 GetDeltaSeqLen (DeltaSeqPtr dsp); 1679 NLM_EXTERN DeltaSeqPtr GetDeltaSeqForPosition(Int4 pos, BioseqPtr bsp, Int4Ptr pStart); 1680 1681 typedef SeqAlignPtr (*GlobalAlignFunc) PROTO ((BioseqPtr, BioseqPtr, BoolPtr)); 1682 1683 typedef enum { 1684 eAdjustFeatForGap_unknown_gaps = 0x01, 1685 eAdjustFeatForGap_known_gaps = 0x02, 1686 eAdjustFeatForGap_make_partial = 0x04, 1687 eAdjustFeatForGap_partial_for_pseudo = 0x08, 1688 eAdjustFeatForGap_trim_ends = 0x10, 1689 eAdjustFeatForGap_split_internal = 0x20, 1690 eAdjustFeatForGap_split_in_intron = 0x40 1691 } EAdjustFeatForGap; 1692 1693 typedef struct adjustfeatforgap { 1694 ValNodePtr feature_list; 1695 Uint4 options; 1696 GlobalAlignFunc align_func; 1697 ValNodePtr features_in_gap; 1698 } AdjustFeatForGapData, PNTR AdjustFeatForGapPtr; 1699 1700 NLM_EXTERN AdjustFeatForGapPtr AdjustFeatForGapFree (AdjustFeatForGapPtr agp); 1701 NLM_EXTERN Boolean FeatureOkForFeatureList (SeqFeatPtr sfp, ValNodePtr feature_list); 1702 NLM_EXTERN void 1703 LocationContainsGaps 1704 (SeqLocPtr slp, 1705 BioseqPtr bsp, 1706 Uint4 options, 1707 BoolPtr terminal_gaps, 1708 BoolPtr internal_gaps, 1709 BoolPtr entirely_in_gap); 1710 1711 NLM_EXTERN void SetPartialsAfterSplittingAtGap (SeqLocPtr before, SeqLocPtr after, Boolean set_partial_ends, Boolean partial5, Boolean partial3); 1712 NLM_EXTERN void AdjustFeatureForGapsCallback (SeqFeatPtr sfp, Pointer data); 1713 NLM_EXTERN void MarkFeaturesInGapsForDeletion (AdjustFeatForGapPtr afgp); 1714 NLM_EXTERN void AdjustCDSLocationsForUnknownGapsCallback (SeqFeatPtr sfp, Pointer data); 1715 NLM_EXTERN Boolean GapInLocation (Int4 seq_offset, Int4 length, SeqLocPtr loc); 1716 NLM_EXTERN BioseqPtr 1717 AddProteinSequenceCopy 1718 (BioseqPtr protbsp, 1719 BioseqPtr featbsp, 1720 SeqFeatPtr new_sfp, 1721 Uint2 entityID); 1722 NLM_EXTERN void AdjustFrame (SeqFeatPtr sfp, BioseqPtr oldprot); 1723 NLM_EXTERN void SetProductSequencePartials (BioseqPtr protbsp, Boolean partial5, Boolean partial3); 1724 NLM_EXTERN void AddCDSGapComment (SeqFeatPtr sfp); 1725 1726 1727 NLM_EXTERN Boolean SeqEdFixProteinFeatures (BioseqPtr oldbsp, BioseqPtr newbsp, Boolean force_fix, GlobalAlignFunc align_func); 1728 NLM_EXTERN void SeqEdTranslateOneCDS (SeqFeatPtr sfp, BioseqPtr featbsp, Uint2 entityID, GlobalAlignFunc align_func); 1729 NLM_EXTERN void SeqEdRemapLocation (SeqAlignPtr salp, SeqLocPtr slp, Int4 seq_len); 1730 1731 NLM_EXTERN CharPtr GetStateAbbreviation (CharPtr state); 1732 1733 typedef SeqAlignPtr (*LocalAlignFunc) PROTO ((BioseqPtr, BioseqPtr)); 1734 1735 extern void ReverseAlignmentStrand (SeqAlignPtr salp, Int4 nth); 1736 1737 NLM_EXTERN SeqAlignPtr SortPairwiseAlignmentsByFirstSeqRange (SeqAlignPtr salp); 1738 NLM_EXTERN ValNodePtr ReportCoverageForBioseqSeqHist (BioseqPtr bsp); 1739 1740 NLM_EXTERN void ConvertLocalIdsToBarcodeIds (SeqEntryPtr sep); 1741 1742 NLM_EXTERN ValNodePtr MakeTokensFromLine (CharPtr line); 1743 1744 NLM_EXTERN SeqFeatPtr GetGeneForFeature (SeqFeatPtr sfp); 1745 NLM_EXTERN SeqFeatPtr GetmRNAforCDS (SeqFeatPtr cds); 1746 NLM_EXTERN SeqFeatPtr GetCDSformRNA (SeqFeatPtr mrna); 1747 1748 NLM_EXTERN Boolean IsStringInSpanInList (CharPtr str, CharPtr list); 1749 1750 NLM_EXTERN void ParseGoTermsFromFields (SeqEntryPtr sep); 1751 1752 /* for autodef */ 1753 typedef enum { 1754 RemovableExon = 0, 1755 RemovableIntron, 1756 Removable5UTR, 1757 Removable3UTR, 1758 RemovableuORF, 1759 RemovableCDS, 1760 RemovablePromoter, 1761 RemovableLTR, 1762 RemovableNoncodingProductFeat, 1763 RemovableMobileElement, 1764 RemovablePrecursorRNA, 1765 RemovablencRNA, 1766 RemovableRepeatRegion, 1767 NumRemovableItems 1768 } RemovableList; 1769 NLM_EXTERN CharPtr GetRemovableItemName (Int4 i); 1770 1771 typedef enum { 1772 DEFLINE_USE_FEATURES = 1, 1773 DEFLINE_COMPLETE_SEQUENCE, 1774 DEFLINE_PARTIAL_SEQUENCE, 1775 DEFLINE_COMPLETE_GENOME, 1776 DEFLINE_PARTIAL_GENOME, 1777 DEFLINE_SEQUENCE 1778 } DefLineType; 1779 1780 typedef struct deflinefeaturerequestlist { 1781 Boolean keep_items[NumRemovableItems]; 1782 Boolean add_fake_promoters; 1783 Boolean suppress_alt_splice_phrase; 1784 Boolean remove_subfeatures; 1785 DefLineType feature_list_type; 1786 Int4 misc_feat_parse_rule; 1787 Boolean suppress_locus_tags; 1788 ValNodePtr suppressed_feature_list; 1789 Boolean use_ncrna_note; 1790 Boolean suppress_allele; 1791 } DeflineFeatureRequestList, PNTR DeflineFeatureRequestListPtr; 1792 1793 NLM_EXTERN void InitFeatureRequests (DeflineFeatureRequestListPtr feature_requests); 1794 1795 1796 /* ModifierItemLocalData is used to store information about the results of 1797 * a search of the set of organisms in a record and the results of user 1798 * input to a dialog for deciding which modifiers should be used in the 1799 * organism description. 1800 */ 1801 typedef struct modifieritemlocal { 1802 /* ButtoN button; */ 1803 Boolean any_present; 1804 Boolean all_present; 1805 Boolean is_unique; 1806 CharPtr first_value_seen; 1807 ValNodePtr values_seen; 1808 Boolean all_unique; 1809 CharPtr status; 1810 Boolean required; 1811 } ModifierItemLocalData, PNTR ModifierItemLocalPtr; 1812 1813 typedef enum { 1814 DEFLINE_POS_Bio_material = 0, 1815 DEFLINE_POS_Biotype, 1816 DEFLINE_POS_Biovar, 1817 DEFLINE_POS_Breed, 1818 DEFLINE_POS_Cell_line, 1819 DEFLINE_POS_Chemovar, 1820 DEFLINE_POS_Chromosome, 1821 DEFLINE_POS_Clone, 1822 DEFLINE_POS_Country, 1823 DEFLINE_POS_Cultivar, 1824 DEFLINE_POS_Culture_collection, 1825 DEFLINE_POS_Dev_stage, 1826 DEFLINE_POS_Ecotype, 1827 DEFLINE_POS_Endogenous_virus_name, 1828 DEFLINE_POS_Genotype, 1829 DEFLINE_POS_Haplogroup, 1830 DEFLINE_POS_Haplotype, 1831 DEFLINE_POS_Isolate, 1832 DEFLINE_POS_Linkage_group, 1833 DEFLINE_POS_Map, 1834 DEFLINE_POS_Pathovar, 1835 DEFLINE_POS_Plasmid_name, 1836 DEFLINE_POS_Pop_variant, 1837 DEFLINE_POS_Segment, 1838 DEFLINE_POS_Serogroup, 1839 DEFLINE_POS_Serotype, 1840 DEFLINE_POS_Serovar, 1841 DEFLINE_POS_Specimen_voucher, 1842 DEFLINE_POS_Strain, 1843 DEFLINE_POS_Subclone, 1844 DEFLINE_POS_Substrain, 1845 DEFLINE_POS_Transgenic 1846 } DefLinePos; 1847 1848 NLM_EXTERN Int4 GetDeflinePosForFieldName(CharPtr name); 1849 NLM_EXTERN Int4 GetDeflinePosForFieldType (ValNodePtr field); 1850 1851 /* ModifierItemGlobalData is used to store information about the available 1852 * modifiers - the name to use when displaying a list of checkboxes, whether 1853 * the modifier is an Organism modifier or a Source modifier, the subtype 1854 * to use when looking for the modifier in the organism qualifier list, 1855 * and whether this modifier is required by default. 1856 */ 1857 typedef struct modifieritemglobal { 1858 CharPtr name; 1859 Boolean isOrgMod; 1860 Uint1 subtype; 1861 } ModifierItemGlobalData, PNTR ModifierItemGlobalPtr; 1862 1863 extern ModifierItemGlobalData DefLineModifiers[]; 1864 NLM_EXTERN size_t NumDefLineModifiers (void); 1865 1866 /* OrganismDescriptionModifiers is used to apply specific user preferences 1867 * for how to construct the organism descriptions - whether or not to use 1868 * labels for the modifiers, whether and how to limit the number of modifiers 1869 * used in any one organism description, whether to keep or remove modifier 1870 * information in parentheses in the organism taxonomy name, and whether or 1871 * not to apply modifiers to organisms with "sp." in the middle of the 1872 * taxonomy name. 1873 */ 1874 typedef struct organismdescriptionmodifiers { 1875 Boolean use_labels; 1876 Int2 max_mods; 1877 Boolean keep_paren; 1878 Boolean exclude_sp; 1879 Boolean exclude_cf; 1880 Boolean exclude_aff; 1881 Boolean exclude_nr; 1882 Boolean include_country_extra; 1883 Int4 clone_isolate_HIV_rule_num; 1884 Boolean use_modifiers; 1885 Boolean allow_semicolon_in_modifier; 1886 Boolean allow_mod_at_end_of_taxname; 1887 } OrganismDescriptionModifiers, PNTR OrganismDescriptionModifiersPtr; 1888 1889 NLM_EXTERN Boolean ShouldExcludeSp (SeqEntryPtr sep); 1890 NLM_EXTERN void InitOrganismDescriptionModifiers(OrganismDescriptionModifiersPtr odmp, SeqEntryPtr sep); 1891 1892 /* These values are used for the clone_isolate_HIV_rule_num value in OrganismDescriptionModifiers */ 1893 typedef enum { 1894 clone_isolate_HIV_rule_prefer_clone = 1, 1895 clone_isolate_HIV_rule_prefer_isolate, 1896 clone_isolate_HIV_rule_want_both 1897 } clone_isolate_HIV_rule_values; 1898 1899 1900 typedef struct sourcequaldesc 1901 { 1902 CharPtr name; 1903 Boolean isOrgMod; 1904 Uint1 subtype; 1905 Uint1 subfield; 1906 } SourceQualDescData, PNTR SourceQualDescPtr; 1907 1908 NLM_EXTERN int LIBCALLBACK SortVnpBySourceQualDesc (VoidPtr ptr1, VoidPtr ptr2); 1909 1910 NLM_EXTERN void SetRequiredModifiers (ModifierItemLocalPtr modList); 1911 NLM_EXTERN void CountModifiers (ModifierItemLocalPtr ItemList, SeqEntryPtr sep); 1912 NLM_EXTERN ValNodePtr FindBestModifiersEx(SeqEntryPtr sep, ModifierItemLocalPtr ItemList, Boolean use_new); 1913 NLM_EXTERN ValNodePtr FindBestModifiers(SeqEntryPtr sep, ModifierItemLocalPtr ItemList); 1914 NLM_EXTERN ValNodePtr FindBestModifiersForDeflineClauseList (ValNodePtr defline_clauses, ModifierItemLocalPtr ItemList); 1915 1916 NLM_EXTERN ValNodePtr GetModifierIndicesFromModList (ModifierItemLocalPtr modList); 1917 extern void TestFindBestQualCombo (FILE *fp); 1918 1919 1920 NLM_EXTERN CharPtr MergeValNodeStrings (ValNodePtr list, Boolean useReturn); 1921 1922 NLM_EXTERN ValNodePtr FindExactStringListMatch (ValNodePtr list, CharPtr value); 1923 1924 NLM_EXTERN void BuildDefLineFeatClauseList 1925 ( SeqEntryPtr sep, 1926 Uint2 entityID, 1927 DeflineFeatureRequestList PNTR feature_requests, 1928 Int2 product_flag, 1929 Boolean alternate_splice_flag, 1930 Boolean gene_cluster_opp_strand, 1931 ValNodePtr PNTR list); 1932 1933 NLM_EXTERN Boolean AreFeatureClausesUnique (ValNodePtr list); 1934 NLM_EXTERN void DefLineFeatClauseListFree (ValNodePtr vnp); 1935 1936 NLM_EXTERN void 1937 BuildDefinitionLinesFromFeatureClauseLists 1938 (ValNodePtr list, 1939 ModifierItemLocalPtr modList, 1940 ValNodePtr modifier_indices, 1941 OrganismDescriptionModifiersPtr odmp); 1942 1943 NLM_EXTERN void 1944 BuildDefLinesFromFeatClauseListsForOneBsp 1945 (ValNodePtr list, 1946 ModifierItemLocalPtr modList, 1947 ValNodePtr modifier_indices, 1948 OrganismDescriptionModifiersPtr odmp, 1949 BioseqPtr bsp); 1950 1951 NLM_EXTERN void 1952 AutoDefForSeqEntry 1953 (SeqEntryPtr sep, 1954 Uint2 entityID, 1955 OrganismDescriptionModifiersPtr odmp, 1956 ModifierItemLocalPtr modList, 1957 ValNodePtr modifier_indices, 1958 DeflineFeatureRequestListPtr feature_requests, 1959 Int2 product_flag, 1960 Boolean alternate_splice_flag, 1961 Boolean gene_cluster_opp_strand); 1962 1963 NLM_EXTERN void 1964 AutoDefForSeqEntryEx 1965 (SeqEntryPtr sep, 1966 Uint2 entityID, 1967 OrganismDescriptionModifiersPtr odmp, 1968 ModifierItemLocalPtr modList, 1969 ValNodePtr modifier_indices, 1970 DeflineFeatureRequestListPtr feature_requests, 1971 Int2 product_flag, 1972 Boolean alternate_splice_flag, 1973 Boolean gene_cluster_opp_strand, 1974 Boolean update_options); 1975 1976 NLM_EXTERN void RegenerateAutoDef(BioseqPtr bsp); 1977 NLM_EXTERN void RemoveAutodefObjects(SeqEntryPtr sep); 1978 NLM_EXTERN void RemoveAutodefObjectsForDesc(SeqDescPtr sdp); 1979 1980 NLM_EXTERN void AddPopsetTitles 1981 (SeqEntryPtr sep, 1982 DeflineFeatureRequestListPtr feature_requests, 1983 Int2 product_flag, 1984 Boolean alternate_splice_flag, 1985 Boolean gene_cluster_opp_strand); 1986 1987 NLM_EXTERN void RemovePopsetTitles(SeqEntryPtr sep); 1988 1989 NLM_EXTERN UserObjectPtr MakeAutoDefOptionsUserObject 1990 (OrganismDescriptionModifiersPtr odmp, 1991 ModifierItemLocalPtr modList, 1992 ValNodePtr modifier_indices, 1993 DeflineFeatureRequestListPtr feature_requests, 1994 Int2 product_flag, 1995 Boolean alternate_splice_flag, 1996 Boolean gene_cluster_opp_strand); 1997 1998 NLM_EXTERN void AddAutoDefUserObjectToSeqEntry(SeqEntryPtr sep, UserObjectPtr uop); 1999 2000 NLM_EXTERN void DoTbl2AsnAutoDef(SeqEntryPtr sep, Uint2 entityID); 2001 2002 typedef struct popsetretrostat { 2003 Int4 feature_clause; 2004 Int4 common_title; 2005 Int4 uncalculatable; 2006 Boolean title_added; 2007 } PopSetRetroStatData, PNTR PopSetRetroStatPtr; 2008 2009 NLM_EXTERN void PopSetAutoDefRetro (SeqEntryPtr sep, PopSetRetroStatPtr stat); 2010 2011 NLM_EXTERN Boolean IsSpName (CharPtr taxName); 2012 2013 #define DEFAULT_ORGANELLE_CLAUSE 10 2014 NLM_EXTERN BioSourcePtr GetBiopForBsp (BioseqPtr bsp); 2015 NLM_EXTERN Boolean IsLocAInBonSameStrand (SeqLocPtr slp1, SeqLocPtr slp2); 2016 NLM_EXTERN void CleanUpTaxName (CharPtr taxName, Boolean keep_in_paren); 2017 NLM_EXTERN Boolean UseOrgModifier (OrgModPtr mod, CharPtr taxName, Boolean allow_at_end); 2018 NLM_EXTERN Boolean UseSubSrcModifier (SubSourcePtr ssp, CharPtr taxName, Boolean allow_at_end); 2019 NLM_EXTERN void AddModifierLabel 2020 ( Boolean use_labels, 2021 Boolean is_orgmod, 2022 Uint1 subtype, 2023 CharPtr modifier_text); 2024 NLM_EXTERN Boolean LIBCALLBACK IsMobileElement (SeqFeatPtr sfp); 2025 NLM_EXTERN void RemoveNucProtSetTitles (SeqEntryPtr sep); 2026 NLM_EXTERN void RemoveMRnaTitles (SeqEntryPtr sep); 2027 NLM_EXTERN void RemoveProteinTitles (SeqEntryPtr sep); 2028 NLM_EXTERN void SetAutoDefIDModifiers (ModifierItemLocalPtr modList); 2029 2030 2031 NLM_EXTERN ValNodePtr ReadTabTableFromFile (FILE *fp); 2032 NLM_EXTERN ValNodePtr FlipTabTableAxes (ValNodePtr row_list); 2033 NLM_EXTERN ValNodePtr FreeTabTable (ValNodePtr row_list); 2034 NLM_EXTERN ValNodePtr CopyTabTable (ValNodePtr row_list); 2035 NLM_EXTERN void WriteTabTableToFile (ValNodePtr table, FILE *fp); 2036 NLM_EXTERN ValNodePtr CountTabTableBlanks (ValNodePtr row_list); 2037 NLM_EXTERN ValNodePtr ScanTabTableForSpecialCharacters (ValNodePtr row_list); 2038 NLM_EXTERN ValNodePtr AutoReplaceSpecialCharactersInText (CharPtr PNTR text); 2039 NLM_EXTERN void AutoReplaceSpecialCharactersWithMessage (CharPtr PNTR text); 2040 NLM_EXTERN ValNodePtr AutoReplaceSpecialCharactersInTabTable (ValNodePtr row_list); 2041 NLM_EXTERN void AutoFixSpecialCharactersInEntity (Uint2 entityID); 2042 2043 NLM_EXTERN void RemoveQuotesFromTabTable (ValNodePtr row_list); 2044 NLM_EXTERN void ReparseTabTableConvertFirstSpaceToTab (ValNodePtr row_list); 2045 NLM_EXTERN void ReparseTabTableConvertMultiSpaceToTab (ValNodePtr row_list); 2046 NLM_EXTERN void CombineTabTableColumns (ValNodePtr row_list, ValNodePtr column_pos, CharPtr delimiter); 2047 NLM_EXTERN void ReparseTabTableSeparateColumnAtDelimiter (ValNodePtr row_list, Char delimiter, Int4 col, Boolean stop_after_first); 2048 NLM_EXTERN void AddTextToTabTableColumn (ValNodePtr row_list, Int4 col, CharPtr text, Uint2 existing_text); 2049 NLM_EXTERN ValNodePtr ReadOneColumnList (CharPtr line); 2050 NLM_EXTERN ValNodePtr SortTableRowByAnyColumn (ValNodePtr table, Int4 column); 2051 NLM_EXTERN void AdjustInfluenzaSourceTable (ValNodePtr table); 2052 2053 NLM_EXTERN void SpecialCharFindWithContext (CharPtr PNTR strp, Pointer userdata, BoolPtr did_find, BoolPtr did_change); 2054 NLM_EXTERN ValNodePtr FreeContextList (ValNodePtr context_list); 2055 2056 typedef struct twostringhash { 2057 CharPtr PNTR table; 2058 Int4 num_lines; 2059 } TwoStringHashData, PNTR TwoStringHashPtr; 2060 2061 NLM_EXTERN TwoStringHashPtr TwoStringHashFree (TwoStringHashPtr tsh); 2062 NLM_EXTERN TwoStringHashPtr MakeTwoStringHashFromTabTable (ValNodePtr line_list, Int4 column1, Int4 column2); 2063 NLM_EXTERN CharPtr GetValueFromTwoStringHash (CharPtr key, TwoStringHashPtr tsh); 2064 2065 NLM_EXTERN Int4 ExtendSeqLocToEnd (SeqLocPtr slp, BioseqPtr bsp, Boolean end5); 2066 2067 NLM_EXTERN void PromoteAllToBestID (SeqEntryPtr sep); 2068 NLM_EXTERN void PromoteAllToWorstID (SeqEntryPtr sep); 2069 NLM_EXTERN void RemoveAllVersionLocusGIFromID (SeqEntryPtr sep); 2070 2071 /* functions for converting features */ 2072 NLM_EXTERN Boolean IsBioseqSetInGPS (BioseqSetPtr bssp); 2073 NLM_EXTERN Boolean IsBioseqInGPS (BioseqPtr bsp); 2074 NLM_EXTERN Boolean IsFeatInGPS (SeqFeatPtr sfp); 2075 NLM_EXTERN void 2076 ApplyCDSOptionsToFeature 2077 (SeqFeatPtr sfp, 2078 Boolean remove_mRNA, 2079 Boolean remove_gene, 2080 Boolean remove_transcript_id, 2081 Boolean keep_original); 2082 2083 NLM_EXTERN Boolean 2084 ConvertCDSToRNA 2085 (SeqFeatPtr sfp, 2086 Uint2 rna_type); 2087 2088 NLM_EXTERN Boolean ConvertGeneToRNA (SeqFeatPtr sfp, Uint2 featdef_to); 2089 NLM_EXTERN Boolean ConvertBioSrcToRepeatRegion (SeqFeatPtr sfp, Uint2 featdef_to); 2090 NLM_EXTERN CharPtr SubSourceText (BioSourcePtr biop, Uint1 subtype, BoolPtr found); 2091 NLM_EXTERN CharPtr OrgModText (BioSourcePtr biop, Uint1 subtype, BoolPtr found); 2092 NLM_EXTERN CharPtr NoteText (BioSourcePtr biop, CharPtr comment); 2093 NLM_EXTERN Boolean ConvertNonPseudoCDSToMiscFeat (SeqFeatPtr sfp, Boolean viral); 2094 2095 NLM_EXTERN CharPtr GetImportFeatureName (Uint2 featdef_key); 2096 NLM_EXTERN RnaRefPtr RnaRefFromLabel (Uint2 featdef_to, CharPtr label, BoolPtr add_label_to_comment); 2097 2098 NLM_EXTERN SeqLocPtr GetProteinLocationForNucleotideFeatureConversion (SeqLocPtr nuc_slp, BoolPtr no_cds); 2099 NLM_EXTERN SeqLocPtr FindNucleotideLocationForProteinFeatureConversion (SeqLocPtr slp); 2100 NLM_EXTERN SeqLocPtr BuildProtLoc (SeqFeatPtr overlapping_cds, SeqLocPtr slp, Int4Ptr frame); 2101 NLM_EXTERN Boolean ConvertImpToProtFunc (SeqFeatPtr sfp, Uint2 featdef_to); 2102 NLM_EXTERN Boolean ConvertProtToImpFunc (SeqFeatPtr sfp, Uint2 featdef_to); 2103 NLM_EXTERN Boolean ConvertRegionToProtFunc (SeqFeatPtr sfp, Uint2 featdef_to); 2104 NLM_EXTERN Boolean ConvertRegionToImpFunc (SeqFeatPtr sfp, Uint2 featdef_to); 2105 NLM_EXTERN Boolean ConvertImpToImpFunc (SeqFeatPtr sfp, Uint2 featdef_to); 2106 NLM_EXTERN Boolean ConvertRegionToRNAFunc (SeqFeatPtr sfp, Uint2 featdef_to); 2107 NLM_EXTERN Boolean ConvertGeneToImpFeatFunc (SeqFeatPtr sfp, Uint2 featdef_to); 2108 NLM_EXTERN Boolean ConvertProtToProtFunc (SeqFeatPtr sfp, Uint2 featdef_to); 2109 NLM_EXTERN Boolean ConvertMiscFeatToGene (SeqFeatPtr sfp); 2110 NLM_EXTERN Boolean ConvertMiscFeatToCodingRegion (SeqFeatPtr sfp); 2111 NLM_EXTERN Boolean ConvertmRNAToCodingRegion (SeqFeatPtr sfp); 2112 NLM_EXTERN Boolean ConverttRNAToGene(SeqFeatPtr sfp); 2113 NLM_EXTERN void ExtraCDSCreationActions (SeqFeatPtr cds, SeqEntryPtr parent_sep); 2114 NLM_EXTERN SeqFeatPtr GetProtFeature (BioseqPtr protbsp); 2115 2116 NLM_EXTERN void InstantiateMatPeptideProducts (SeqEntryPtr sep); 2117 2118 NLM_EXTERN Boolean CodingRegionHasTranslExcept (SeqFeatPtr sfp); 2119 2120 NLM_EXTERN SeqEntryPtr SequenceStringToSeqEntry (CharPtr str, SeqIdPtr sip, Uint1 mol_type); 2121 2122 NLM_EXTERN void RevCompOneFeatForBioseq (SeqFeatPtr sfp, BioseqPtr bsp); 2123 NLM_EXTERN void RevCompFeats (SeqEntryPtr sep, Pointer mydata, Int4 index, Int2 indent); 2124 2125 /* for parsing collection dates */ 2126 NLM_EXTERN CharPtr ReformatDateStringEx (CharPtr orig_date, Boolean month_first, BoolPtr month_ambiguous); 2127 NLM_EXTERN CharPtr ReformatDateWithMonthNames (CharPtr orig_date); 2128 NLM_EXTERN Int4 GetYearFromToken (CharPtr token, Int4 token_len); 2129 NLM_EXTERN Int4 ReadNumberFromToken (CharPtr token, Int4 token_len); 2130 NLM_EXTERN CharPtr GetMonthFromToken (CharPtr token, Int4 token_len); 2131 NLM_EXTERN Int4 GetMonthNumFromAbbrev (CharPtr month_abbrev); 2132 NLM_EXTERN CharPtr GetMonthAbbrev (Int4 n); 2133 NLM_EXTERN Int4 GetDaysInMonth (Int4 n); 2134 2135 /* for reformatting assembly date */ 2136 NLM_EXTERN CharPtr AssemblyDateFromCollectionDate (CharPtr collection_date, Boolean ambiguous); 2137 NLM_EXTERN Boolean ReformatAssemblyDate (CharPtr PNTR orig_date); 2138 2139 NLM_EXTERN void CreateStructuredCommentsForAllFromTable (SeqEntryPtr sep, ValNodePtr header, ValNodePtr line, ValNodePtr PNTR err_list); 2140 NLM_EXTERN ValNodePtr CreateStructuredCommentsFromFile (FILE *fp, SeqEntryPtr sep, Boolean apply_to_all); 2141 NLM_EXTERN void AddDatabaseNameToStructuredComment (UserObjectPtr uop, CharPtr dbname); 2142 NLM_EXTERN ValNodePtr CreateStructuredCommentTableFromSeqEntry (SeqEntryPtr sep); 2143 2144 #define ALNMGR_GAP -2 2145 #define ALNMGR_ROW_UNDEFINED -1 2146 2147 NLM_EXTERN void 2148 AlignmentIntervalToString 2149 (SeqAlignPtr salp, 2150 Int4 row, 2151 Int4 start, 2152 Int4 stop, 2153 Int4 target_row, 2154 Boolean view_whole_entity, 2155 Uint1Ptr seqbuf, 2156 Uint1Ptr alnbuf, 2157 Int4 PNTR alnbuffer_len, 2158 Boolean show_substitutions); 2159 2160 extern void CountNsInSequence ( 2161 BioseqPtr bsp, 2162 Int4Ptr p_totalN, 2163 Int4Ptr p_totalDash, 2164 Int4Ptr p_totalTilde, 2165 Int4Ptr p_max_stretch, 2166 Boolean expand_gaps, 2167 Boolean no_stretch_in_assembly_gap 2168 ); 2169 NLM_EXTERN Boolean IsTSA (BioseqPtr bsp); 2170 2171 NLM_EXTERN Boolean IsPseudo (SeqFeatPtr sfp); 2172 2173 NLM_EXTERN Boolean ExtendPartialsToEndOrGap (SeqFeatPtr sfp); 2174 NLM_EXTERN Boolean RetranslateOneCDS 2175 ( SeqFeatPtr sfp, 2176 Uint2 entityID, 2177 Boolean include_stop, 2178 Boolean no_stop_at_end_of_complete_cds); 2179 NLM_EXTERN SeqFeatPtr FindBestProtein (Uint2 entityID, SeqLocPtr product); 2180 NLM_EXTERN void AddNonExtendableException (SeqFeatPtr sfp); 2181 NLM_EXTERN SeqLocPtr GetmRNALocationFromCDSLocation (SeqLocPtr slp, Uint2 entityID); 2182 NLM_EXTERN SeqFeatPtr AddmRNAForCDS (SeqFeatPtr sfp); 2183 NLM_EXTERN Boolean ProductsMatchForRefSeq (CharPtr cds_str, CharPtr mrna_str); 2184 NLM_EXTERN SeqSubmitPtr FindSeqSubmitForSeqEntry (SeqEntryPtr sep); 2185 NLM_EXTERN Boolean CreateMatPeptideFromCDS (SeqFeatPtr sfp); 2186 NLM_EXTERN Boolean ConvertCDSToMatPeptideForOverlappingCDS (SeqFeatPtr sfp, SeqFeatPtr top_cds, Boolean remove_original); 2187 NLM_EXTERN Boolean AutoConvertCDSToMiscFeat (SeqFeatPtr cds, Boolean remove_original); 2188 2189 NLM_EXTERN AuthListPtr PNTR GetAuthListForPub (PubPtr the_pub); 2190 NLM_EXTERN void RemoveConsortiumFromPub (PubPtr pub); 2191 2192 NLM_EXTERN Int4 Extend5PartialSeqIntToEndOrGap (SeqIntPtr sint, BioseqPtr bsp, Boolean short_only); 2193 NLM_EXTERN Int4 Extend3PartialSeqIntToEndOrGap (SeqIntPtr sint, BioseqPtr bsp, Boolean short_only); 2194 NLM_EXTERN Int4 ExtendSeqLocToEndOrGap (SeqLocPtr slp, BioseqPtr bsp, Boolean end5); 2195 NLM_EXTERN FloatLo PercentNInBioseq (BioseqPtr bsp, Boolean include_gaps); 2196 NLM_EXTERN FloatLo PercentNInBioseqInterval (BioseqPtr bsp, Int4 start, Int4 stop, Boolean include_gaps); 2197 2198 NLM_EXTERN SeqEntryPtr GetBestSeqEntryForItem (ValNodePtr vnp); 2199 NLM_EXTERN void AddNewUniqueDescriptors (SeqDescrPtr PNTR new_set, SeqDescrPtr parent_set); 2200 NLM_EXTERN void AddNewUniqueAnnotations (SeqAnnotPtr PNTR new_set, SeqAnnotPtr parent_set); 2201 NLM_EXTERN BioseqSetPtr MakeGroupsForUniqueValues (BioseqSetPtr bssp, ValNodePtr value_lists); 2202 NLM_EXTERN ValNodePtr PrepareSequenceListForSegregateByNumberOfSets (Int4 num_sets, SeqEntryPtr sep); 2203 NLM_EXTERN void SegregateSetsByNumber (SeqEntryPtr sep, Int4 num_sets); 2204 NLM_EXTERN ValNodePtr PrepareSequenceListForSegregateByNumberPerSet (Int4 num_per_set, SeqEntryPtr sep); 2205 NLM_EXTERN void SegregateSetsByNumberPerSet (SeqEntryPtr sep, Int4 num_per_set); 2206 2207 NLM_EXTERN void MoveSequencesFromSetToWrapper (ValNodePtr list, Uint2 entityID); 2208 2209 NLM_EXTERN ValNodePtr CreateStructuredCommentsFromRow (ValNodePtr header, ValNodePtr values, CharPtr id_str, ValNodePtr PNTR err_list); 2210 2211 NLM_EXTERN void MergeAdjacentAnnotsInList (SeqAnnotPtr sap); 2212 2213 NLM_EXTERN Boolean GetsDocsumTitle(Uint1 set_class); 2214 NLM_EXTERN void PromoteCommonTitlesToSet (SeqEntryPtr sep); 2215 2216 NLM_EXTERN void SetDescriptorPropagate (BioseqSetPtr bssp); 2217 2218 typedef Boolean (*DescriptorTestFunc) PROTO ((SeqDescPtr, Pointer)); 2219 NLM_EXTERN void PropagateSomeDescriptors (SeqEntryPtr sep, DescriptorTestFunc test_func, Pointer extradata); 2220 NLM_EXTERN void PropagateDblinkDescriptors (SeqEntryPtr sep); 2221 2222 NLM_EXTERN Boolean RemoveDuplicateNestedSetsForEntityID (Uint2 entityID); 2223 NLM_EXTERN Boolean RemoveDuplicateNestedSetsForEntityIDNoUpdate (Uint2 entityID); 2224 2225 NLM_EXTERN void AddStructuredCommentKeywords (Uint2 entityID); 2226 NLM_EXTERN CharPtr KeywordForStructuredCommentPrefix (CharPtr prefix); 2227 NLM_EXTERN CharPtr StructuredCommentPrefixForKeyword (CharPtr keyword); 2228 NLM_EXTERN CharPtr KeywordForStructuredCommentName (UserObjectPtr uop); 2229 NLM_EXTERN Boolean HasKeywordForStructuredCommentName (BioseqPtr bsp, UserObjectPtr uop); 2230 NLM_EXTERN Boolean HasAllKeywordsForStructuredComment (BioseqPtr bsp, CharPtr keyword); 2231 NLM_EXTERN Boolean HasAnyKeywordForStructuredComment (BioseqPtr bsp, CharPtr keyword); 2232 NLM_EXTERN ValNodePtr GetAllStructuredCommentKeywords (void); 2233 NLM_EXTERN void RemoveStructuredCommentKeywords (Uint2 entityID); 2234 NLM_EXTERN void RemoveAllStructuredCommentKeywords (Uint2 entityID); 2235 NLM_EXTERN ValNodePtr SplitStringAtSemicolon (CharPtr keyword); 2236 2237 NLM_EXTERN void ParseTaxNameToQuals (OrgRefPtr org, TextFsaPtr tags); 2238 2239 NLM_EXTERN ValNodePtr GetLocusTagPrefixList (SeqEntryPtr sep); 2240 2241 NLM_EXTERN Boolean IsProductNameOk (CharPtr product_name); 2242 NLM_EXTERN Boolean ReportProductNameProblems (CharPtr product_name, FILE *output_file, CharPtr prefix); 2243 NLM_EXTERN Boolean FixProductNameProblems (CharPtr PNTR product_name); 2244 2245 NLM_EXTERN SeqEntryPtr ReadFilteredAsn (FILE *fp, Boolean is_binary, CharPtr accn_list, Uint2Ptr entityIDptr); 2246 NLM_EXTERN void ReintegrateFilteredAsn (SeqEntryPtr sep, FILE *orig_file, FILE *output, Boolean is_binary); 2247 2248 typedef struct descstream { 2249 SeqDescPtr orig; 2250 SeqDescPtr replace; 2251 SeqIdPtr owners; 2252 SeqIdPtr last_owner; 2253 Boolean on_all; 2254 CharPtr text; 2255 Int4 num_dependent; 2256 } DescStreamData, PNTR DescStreamPtr; 2257 2258 NLM_EXTERN DescStreamPtr DescStreamNew (SeqDescPtr sdp, BioseqPtr parent); 2259 NLM_EXTERN DescStreamPtr DescStreamFree (DescStreamPtr ds); 2260 NLM_EXTERN ValNodePtr DescStreamListFree (ValNodePtr vnp); 2261 2262 NLM_EXTERN ValNodePtr StreamAsnForDescriptors (FILE *fp, Boolean is_binary, Boolean is_batch, Boolean is_submit, SeqIdPtr PNTR sip_list); 2263 NLM_EXTERN void WriteAsnWithReplacedDescriptors (ValNodePtr desc_stream_list, FILE *orig_file, FILE *output, Boolean is_binary, Boolean is_batch, Boolean is_submit); 2264 NLM_EXTERN Boolean IdListsMatch (SeqIdPtr sip_list, ValNodePtr all_sip); 2265 NLM_EXTERN void SetOnAllValsForDescStreamList (ValNodePtr desc_list, ValNodePtr all_sip); 2266 2267 extern Boolean ParseCodeBreak (SeqFeatPtr sfp, CharPtr val, Int4 offset); 2268 2269 NLM_EXTERN void CleanupOneSeqFeat (SeqFeatPtr sfp); 2270 2271 NLM_EXTERN Uint1 GetSpecialPlastidGenCode ( 2272 CharPtr taxname, 2273 CharPtr lineage 2274 ); 2275 2276 2277 NLM_EXTERN Boolean TrimPrimerSeqJunkInSeqEntry (SeqEntryPtr sep, FILE *log_fp); 2278 NLM_EXTERN Boolean FixUsaAndStateAbbreviations (Uint2 entityID, FILE *log_fp); 2279 NLM_EXTERN void AdjustSeqEntryForConsensusSplice (SeqEntryPtr sep); 2280 NLM_EXTERN Boolean AdjustSeqEntryForConsensusSpliceEx (SeqEntryPtr sep, FILE *log_fp, Boolean strict); 2281 2282 NLM_EXTERN void 2283 FixCapitalizationInTitle 2284 (CharPtr PNTR pTitle, 2285 Boolean first_is_upper, 2286 ValNodePtr org_names); 2287 2288 NLM_EXTERN Int4 ConvertCommentsWithSpacesToStructuredCommentsForSeqEntry (SeqEntryPtr sep); 2289 2290 NLM_EXTERN void ParseExtractorResultsTableToFeatures (FILE *fp, SeqEntryPtr sep); 2291 NLM_EXTERN void ParseRNAFeatListTableToFeatures (FILE *fp, SeqEntryPtr sep, LogInfoPtr lip); 2292 2293 2294 #ifdef OS_MSWIN 2295 NLM_EXTERN Int4 RunSilent(const char *cmdline); 2296 #endif 2297 2298 2299 NLM_EXTERN CharPtr ValNodeSeqIdName (ValNodePtr vnp); 2300 NLM_EXTERN void ValNodeSeqIdFree (ValNodePtr vnp); 2301 NLM_EXTERN ValNodePtr ValNodeSeqIdCopy (ValNodePtr vnp); 2302 NLM_EXTERN Boolean ValNodeSeqIdMatch (ValNodePtr vnp1, ValNodePtr vnp2); 2303 NLM_EXTERN ValNodePtr ValNodeSeqIdListFree (ValNodePtr list); 2304 NLM_EXTERN ValNodePtr ValNodeSeqIdListCopy (ValNodePtr list); 2305 NLM_EXTERN ValNodePtr SeqIdListToValNodeSeqIdList (SeqIdPtr sip_list); 2306 NLM_EXTERN SeqIdPtr ValNodeSeqIdListToSeqIdList (ValNodePtr vnp_list); 2307 2308 NLM_EXTERN void StringToLower (CharPtr str); 2309 2310 NLM_EXTERN ValNodePtr FixupCountryQuals (SeqEntryPtr sep, Boolean fix_after_colon); 2311 NLM_EXTERN Boolean FixupCountryQualsWithLog (SeqEntryPtr sep, Boolean fix_after_colon, FILE *log_fp); 2312 NLM_EXTERN Boolean FixupMouseStrains (SeqEntryPtr sep, FILE *log_fp); 2313 2314 NLM_EXTERN CharPtr StructuredCommentDbnameFromString (CharPtr string); 2315 NLM_EXTERN ValNodePtr GetStructuredCommentPrefixList (void); 2316 NLM_EXTERN void SetStructuredCommentPrefixAndSuffix (UserObjectPtr uop, CharPtr string); 2317 2318 extern ValNodePtr GetSourceQualDescListEx (Boolean get_subsrc, Boolean get_orgmod, Boolean get_discouraged, Boolean get_discontinued, Boolean get_subfields); 2319 2320 NLM_EXTERN Boolean RemoveCultureNotes (SeqEntryPtr sep); 2321 2322 NLM_EXTERN AuthListPtr GetAuthorListForPub (PubPtr the_pub); 2323 2324 NLM_EXTERN void FixProductWordCapitalization (CharPtr PNTR pProduct); 2325 NLM_EXTERN Boolean FixSrcQualCaps (SeqEntryPtr sep, Int4 src_qual, FILE *log_fp); 2326 NLM_EXTERN Boolean IsNCBIFileID (SeqIdPtr sip); 2327 2328 NLM_EXTERN Boolean IsLocationOrganelle (Uint1 genome); 2329 NLM_EXTERN Boolean IsBioseqOrganelle (BioseqPtr bsp); 2330 2331 NLM_EXTERN void RemoveFeatureLink (SeqFeatPtr sfp1, SeqFeatPtr sfp2); 2332 NLM_EXTERN void LinkTwoFeatures (SeqFeatPtr dst, SeqFeatPtr sfp); 2333 NLM_EXTERN void MakeFeatureXrefsFromProteinIdQuals (SeqEntryPtr sep); 2334 NLM_EXTERN void MakeFeatureXrefsFromTranscriptIdQuals (SeqEntryPtr sep); 2335 NLM_EXTERN void FinishHalfXrefs (SeqEntryPtr sep); 2336 NLM_EXTERN void FlipCodonRecognizedInSeqEntry (SeqEntryPtr sep, LogInfoPtr lip); 2337 NLM_EXTERN void RemoveBadCodonRecognizedInSeqEntry (SeqEntryPtr sep, LogInfoPtr lip); 2338 NLM_EXTERN Uint1 GetAaFromtRNA (tRNAPtr trp); 2339 NLM_EXTERN CharPtr GetCodesFortRNA (SeqFeatPtr sfp, Int2 *pCode); 2340 2341 NLM_EXTERN void ReverseBioseqInAlignment (SeqAlignPtr salp, Pointer userdata); 2342 NLM_EXTERN void FlipAlignment (SeqAlignPtr salp); 2343 NLM_EXTERN void FlipEntireAlignmentIfAllSequencesFlipped (SeqAnnotPtr sap, Pointer userdata); 2344 NLM_EXTERN ValNodePtr ListSequencesWithAlignments (ValNodePtr bsp_list); 2345 typedef Boolean (LIBCALL *BioseqFunc) (BioseqPtr); 2346 NLM_EXTERN void RevCompBioseqList (ValNodePtr bsp_list, 2347 Uint2 entityID, 2348 BioseqFunc func, 2349 Boolean revCompFeats, 2350 Boolean check_for_aln); 2351 NLM_EXTERN Boolean IsBioseqInAnyAlignment (BioseqPtr bsp, Uint2 input_entityID); 2352 NLM_EXTERN Boolean AreAnyElementsOfSetInAnyAlignment (BioseqSetPtr bssp, Uint2 input_entityID); 2353 NLM_EXTERN void RemoveAlignmentsWithSequence (BioseqPtr bsp, Uint2 input_entityID); 2354 NLM_EXTERN void RemoveAlignmentsWithElementsOfSet (BioseqSetPtr bssp, Uint2 input_entityID); 2355 2356 NLM_EXTERN void ReplaceComplexLocation (SeqLocPtr slp, SeqAlignPtr salp, Int4 new_len, Int4 begin, Int4 fin); 2357 NLM_EXTERN void ReplaceOneSequence (SeqAlignPtr salp, BioseqPtr oldbsp, BioseqPtr newbsp); 2358 NLM_EXTERN Boolean AreSequenceResiduesIdentical (BioseqPtr bsp1, BioseqPtr bsp2); 2359 NLM_EXTERN SeqAlignPtr AlignForSequenceUpdate (BioseqPtr bsp1, BioseqPtr bsp2, BoolPtr revcomp, GlobalAlignFunc align_func); 2360 NLM_EXTERN void AddCitSubToUpdatedSequence (BioseqPtr upd_bsp, Uint2 input_entityID, CharPtr update_txt); 2361 NLM_EXTERN ValNodePtr CreateUpdateCitSubFromBestTemplate (SeqEntryPtr top_sep, SeqEntryPtr upd_sep, CharPtr update_txt); 2362 NLM_EXTERN void RemoveQualityScores (BioseqPtr bsp, FILE *log_fp, BoolPtr data_in_log); 2363 NLM_EXTERN void ReplaceFakeIDWithIDFromTitle (BioseqPtr bsp); 2364 2365 typedef void (*Nlm_ImportSeqCallbackProc) PROTO ((Int4, Int4, Pointer)); 2366 2367 NLM_EXTERN SeqEntryPtr 2368 ImportNucleotideFASTASequencesFromFileEx 2369 (FILE *fp, 2370 Boolean parse_id, 2371 CharPtr supplied_id_txt, 2372 ValNodePtr PNTR err_msg_list, 2373 BoolPtr chars_stripped, 2374 Boolean allow_char_stripping, 2375 Nlm_ImportSeqCallbackProc callback, 2376 Pointer callback_data); 2377 2378 NLM_EXTERN SeqEntryPtr 2379 ImportNucleotideFASTASequencesFromFile 2380 (FILE *fp, 2381 Boolean parse_id, 2382 CharPtr supplied_id_txt, 2383 ValNodePtr PNTR err_msg_list, 2384 BoolPtr chars_stripped, 2385 Boolean allow_char_stripping); 2386 NLM_EXTERN SeqEntryPtr ImportProteinFASTASequences 2387 (FILE *fp, 2388 Boolean parse_id, 2389 CharPtr supplied_id_txt, 2390 ValNodePtr PNTR err_msg_list, 2391 BoolPtr chars_stripped); 2392 NLM_EXTERN void AddUniqueUpdateSequenceIDs (SeqEntryPtr sep); 2393 NLM_EXTERN void ListBioseqsInSeqEntry (SeqEntryPtr sep, Boolean is_na, Int4Ptr seq_num, ValNodePtr PNTR bioseq_list); 2394 NLM_EXTERN ValNodePtr ShuffleUpdateBioseqList (ValNodePtr PNTR update_bioseq_list, ValNodePtr orig_bioseq_list); 2395 NLM_EXTERN ValNodePtr GetNthValNode (ValNodePtr list, Int4 n); 2396 NLM_EXTERN ValNodePtr ExtractNthValNode (ValNodePtr PNTR list, Int4 nth); 2397 NLM_EXTERN BioseqPtr FindBioseqInList (ValNodePtr bioseq_list, SeqIdPtr sip, Int4Ptr position); 2398 NLM_EXTERN void ReplaceCollidingUpdateIDs (ValNodePtr update_bioseq_list, ValNodePtr orig_bioseq_list); 2399 NLM_EXTERN Boolean RelaxedSeqIdIn (SeqIdPtr sip, SeqIdPtr sip_list); 2400 NLM_EXTERN void RemoveSequencesWithoutUpdates (ValNodePtr PNTR orig_bioseq_list, ValNodePtr PNTR update_bioseq_list); 2401 extern CharPtr kSubmitterUpdateText; 2402 2403 NLM_EXTERN SeqLocPtr MakeGeneLocForFeatureLoc (SeqLocPtr floc, Uint2 entityID, Boolean trans_spliced); 2404 2405 typedef struct lclidlist { 2406 BioseqPtr firstbsp; 2407 SeqIdPtr firstsip; 2408 CharPtr key; 2409 Int2 count; 2410 struct lclidlist PNTR left; 2411 struct lclidlist PNTR right; 2412 } LclIdList, PNTR LclIdListPtr; 2413 2414 NLM_EXTERN void ResolveExistingIDsCallback (SeqEntryPtr sep, Pointer mydata, Int4 index, Int2 indent); 2415 NLM_EXTERN void FreeLclTree (LclIdListPtr PNTR head); 2416 NLM_EXTERN Boolean HasAlignmentsWithLocalIDs (SeqEntryPtr sep); 2417 NLM_EXTERN Boolean DoesStringContainPhrase (CharPtr str, CharPtr phrase, Boolean case_sensitive, Boolean whole_word); 2418 2419 NLM_EXTERN Int4 UpdateReplacedECNumbers (SeqEntryPtr sep); 2420 NLM_EXTERN Int4 UpdateReplacedECNumbersEx (SeqEntryPtr sep, ValNodePtr PNTR head, ValNodePtr PNTR tail, Boolean only_unambig, Boolean justwarn); 2421 NLM_EXTERN Int4 DeleteBadECNumbers (SeqEntryPtr sep); 2422 NLM_EXTERN Int4 DeleteBadECNumbersEx (SeqEntryPtr sep, ValNodePtr PNTR head, ValNodePtr PNTR tail, Boolean justwarn); 2423 2424 NLM_EXTERN void SegregateSetsByPlantGroup (SeqEntryPtr sep); 2425 NLM_EXTERN void SegregateSetsByFungusGroup (SeqEntryPtr sep); 2426 NLM_EXTERN ValNodePtr PrepareSequenceListForSegregateByBioseqList (SeqEntryPtr sep, ValNodePtr bsp_list); 2427 NLM_EXTERN void SegregateSetsByBioseqList (SeqEntryPtr sep, ValNodePtr vnp); 2428 NLM_EXTERN Boolean SeqEntryHasPairwiseAlignments (SeqEntryPtr sep); 2429 NLM_EXTERN int LIBCALLBACK SortVnpByChoiceAndPtrvalue (VoidPtr ptr1, VoidPtr ptr2); 2430 NLM_EXTERN int CompareSequences (BioseqPtr bsp1, BioseqPtr bsp2, Boolean allow_Ndiff); 2431 2432 NLM_EXTERN Int2 GetGenCodeForBsp (BioseqPtr bsp); 2433 2434 /* for unverified user objects */ 2435 typedef enum unverifiedtype { 2436 eUnverifiedType_Organism = 0, 2437 eUnverifiedType_Features , 2438 eUnverifiedType_Misassembled , 2439 eUnverifiedType_Max 2440 } UnverifiedMatchType; 2441 NLM_EXTERN CharPtr GetUnverifiedMatchName (Int4 unverified_type); 2442 2443 NLM_EXTERN CharPtr GetRepliconChromosomeName (BioSourcePtr biop); 2444 NLM_EXTERN CharPtr GetRepliconType (BioSourcePtr biop); 2445 NLM_EXTERN CharPtr GetRepliconLocation (BioSourcePtr biop); 2446 2447 NLM_EXTERN PubPtr ParsePubFromEndnote (FILE *fp); 2448 NLM_EXTERN CharPtr GetDefinitionLineFASTAModifiers (BioseqPtr bsp, Boolean include_subsource); 2449 NLM_EXTERN CharPtr GetDefinitionLineFASTAModifiersByList (BioseqPtr bsp, ValNodePtr list); 2450 2451 /* for finding frameshifts */ 2452 typedef enum { 2453 eFrameShiftReport_NoReport = 0, 2454 eFrameShiftReport_Intron = 1, 2455 eFrameShiftReport_Exon = 2, 2456 eFrameShiftReport_ExonMult3 = 3 2457 } EFrameShiftReport; 2458 2459 typedef struct frameshiftreport { 2460 CharPtr msg; 2461 Int4 aln_pos; 2462 Int4 first_related_seq; 2463 } FrameShiftReportData, PNTR FrameShiftReportPtr; 2464 2465 NLM_EXTERN ValNodePtr FrameShiftReportListFree (ValNodePtr vnp); 2466 NLM_EXTERN void PrintFrameShiftReportList (ValNodePtr list, Boolean has_exons, Boolean print_exons_only, LogInfoPtr lip); 2467 NLM_EXTERN ValNodePtr FindFrameShiftsInAlignment (SeqAlignPtr salp, BoolPtr has_exons); 2468 NLM_EXTERN Boolean PropagateMissingOldNames (ValNodePtr sep_list); 2469 NLM_EXTERN CharPtr DescribeBioSourceDifferences (BioSourcePtr biop1, BioSourcePtr biop2); 2470 NLM_EXTERN CharPtr DescribeStructuredCommentDifferences (UserObjectPtr uop1, UserObjectPtr uop2); 2471 NLM_EXTERN Boolean RemoveDuplicateStructuredCommentsInSeqEntry (SeqEntryPtr sep); 2472 NLM_EXTERN ValNodePtr GetSUCCommonList (SeqEntryPtr sep, Boolean reverse, Boolean byblock, Boolean showsequence, Boolean byqual); 2473 2474 NLM_EXTERN ValNodePtr LookupArticlesWithEutils (ValNodePtr orig_pub, LogInfoPtr lip); 2475 NLM_EXTERN Int4 LookupPubsInSeqEntry (SeqEntryPtr sep, LogInfoPtr lip); 2476 2477 NLM_EXTERN void LogTrimmedLocation (LogInfoPtr lip, SeqLocPtr slp); 2478 2479 NLM_EXTERN void AddListOutputTags(ValNodePtr discrepancy_list, DiscReportOutputConfigPtr oc); 2480 NLM_EXTERN Boolean IsMrnaSequence (BioseqPtr bsp); 2481 NLM_EXTERN BioseqPtr BioseqFromAlignmentID (CharPtr PNTR id_str); 2482 2483 NLM_EXTERN Int4 TrimNsFromNucsInSeqEntry (SeqEntryPtr sep, LogInfoPtr lip); 2484 NLM_EXTERN void CorrectGenCodes (SeqEntryPtr sep, Uint2 entityID); 2485 NLM_EXTERN int CompareUserFields (UserFieldPtr ufp1, UserFieldPtr ufp2); 2486 2487 NLM_EXTERN void RemoveEmptyStructuredComments (Uint2 entityID); 2488 2489 NLM_EXTERN Boolean IsStructuredCommentPrefix (UserFieldPtr ufp); 2490 NLM_EXTERN Boolean IsStructuredCommentSuffix (UserFieldPtr ufp); 2491 NLM_EXTERN CharPtr GetStructuredCommentPrefix (UserObjectPtr uop); 2492 2493 2494 typedef struct fielddiff { 2495 ValNodePtr field; 2496 CharPtr seq_id; 2497 CharPtr biosample_id; 2498 CharPtr val1; 2499 CharPtr val2; 2500 ValNodePtr src; 2501 } FieldDiffData, PNTR FieldDiffPtr; 2502 2503 NLM_EXTERN FieldDiffPtr FieldDiffFree (FieldDiffPtr diff); 2504 NLM_EXTERN ValNodePtr LIBCALL FieldDiffListFree (ValNodePtr list); 2505 NLM_EXTERN ValNodePtr GetBioSourceFieldDiffs (CharPtr seq_id, CharPtr biosample_id, BioSourcePtr biop1, BioSourcePtr biop2, ValNodePtr field_list, Uint1 src_type, Pointer src_data); 2506 NLM_EXTERN ValNodePtr GetStructuredCommentFieldDiffs (CharPtr seq_id, CharPtr biosample_id, UserObjectPtr uop1, UserObjectPtr uop2, ValNodePtr field_list, Uint1 src_type, Pointer src_data); 2507 NLM_EXTERN int LIBCALLBACK SortVnpByFieldDiffField (VoidPtr ptr1, VoidPtr ptr2); 2508 NLM_EXTERN int LIBCALLBACK SortVnpByFieldDiffBioIdThenField (VoidPtr ptr1, VoidPtr ptr2); 2509 NLM_EXTERN int LIBCALLBACK SortVnpByFieldDiffBiosampleIdThenFieldThenVal (VoidPtr ptr1, VoidPtr ptr2); 2510 2511 NLM_EXTERN Boolean FindFlankingGenes (SeqLocPtr location, SeqFeatPtr PNTR firstP, SeqFeatPtr PNTR lastP); 2512 NLM_EXTERN void AssignGeneXrefToFeat (SeqFeatPtr sfp, SeqFeatPtr gene); 2513 2514 2515 /* for cleanup of BioSources */ 2516 NLM_EXTERN void ConsolidateBioSourceNotes (BioSourcePtr biop); 2517 NLM_EXTERN void ConsolidateOneLikeOrganismModifier (OrgModPtr match_to, Boolean use_semicolon); 2518 NLM_EXTERN void ConsolidateOneLikeSubSourceModifier (SubSourcePtr match_to, Boolean use_semicolon); 2519 2520 #define kAllowManualGenCodeException "genetic code exception" 2521 2522 2523 NLM_EXTERN Boolean ReplaceStopsWithSelenocysteineInSeqEntry (SeqEntryPtr sep, FILE *log_fp); 2524 NLM_EXTERN Boolean JoinShortTrnas (SeqEntryPtr sep, FILE *log_fp); 2525 2526 NLM_EXTERN Boolean IsDBLinkObject (UserObjectPtr uop); 2527 2528 typedef struct gaplocdata { 2529 Int4 start; 2530 Int4 length; 2531 CharPtr estimated_length; 2532 CharPtr gap_type; 2533 CharPtr linkage_evidence; 2534 Boolean unknown_length; 2535 } GapLocData, PNTR GapLocPtr; 2536 2537 2538 void PopulateGapLocQuals(GapLocPtr glp, SeqFeatPtr sfp, Int4 left, Int4 len); 2539 GapLocPtr GapLocFromSeqFeat(SeqFeatPtr sfp, Int4 left); 2540 Boolean IncompatibleGapFeatQuals (SeqFeatPtr sfp); 2541 void BioseqToDeltaByGapFeat (BioseqPtr bsp, Pointer userdata); 2542 void BioseqToDeltaMergeGapFeat (BioseqPtr bsp, Pointer userdata); 2543 Boolean DeltaLitOnly (BioseqPtr bsp); 2544 Boolean MergeAssemblyGapFeats (BioseqPtr bsp); 2545 2546 NLM_EXTERN Boolean IsRegulatorySubtype (Uint1 key); 2547 2548 2549 #ifdef __cplusplus 2550 } 2551 #endif 2552 2553 #undef NLM_EXTERN 2554 #ifdef NLM_EXPORT 2555 #define NLM_EXTERN NLM_EXPORT 2556 #else 2557 #define NLM_EXTERN 2558 #endif 2559 2560 #endif /* ndef _SQNUTILS_ */ 2561 2562