1// =========================================================================== 2// 3// PUBLIC DOMAIN NOTICE 4// National Center for Biotechnology Information (NCBI) 5// 6// This software/database is a "United States Government Work" under the 7// terms of the United States Copyright Act. It was written as part of 8// the author's official duties as a United States Government employee and 9// thus cannot be copyrighted. This software/database is freely available 10// to the public for use. The National Library of Medicine and the U.S. 11// Government do not place any restriction on its use or reproduction. 12// We would, however, appreciate having the NCBI and the author cited in 13// any work or product based on this material. 14// 15// Although all reasonable efforts have been taken to ensure the accuracy 16// and reliability of the software and data, the NLM and the U.S. 17// Government do not and cannot warrant the performance or results that 18// may be obtained by using this software or data. The NLM and the U.S. 19// Government disclaim all warranties, express or implied, including 20// warranties of performance, merchantability or fitness for any particular 21// purpose. 22// 23// =========================================================================== 24// 25// File Name: transmute.go 26// 27// Author: Jonathan Kans 28// 29// ========================================================================== 30 31package main 32 33import ( 34 "encoding/base64" 35 "eutils" 36 "fmt" 37 "html" 38 "io" 39 "io/ioutil" 40 "net/url" 41 "os" 42 "runtime" 43 "runtime/debug" 44 "runtime/pprof" 45 "strconv" 46 "strings" 47 "sync" 48 "unicode" 49) 50 51// TRANSMUTE HELP MESSAGE TEXT 52 53const transmuteHelp = ` 54Pretty-Printing 55 56 Reformat XML 57 58 -x2p 59 60 Reformat JSON 61 62 -j2p 63 64 Table column alignment 65 66 -align 67 68 -a Column alignment codes: 69 70 l left 71 c center 72 r right 73 n numeric align on decimal point 74 N trailing zero-pad decimals 75 z leading zero-pad integers 76 77 -g Spacing between columns 78 -h Indent before columns 79 80Data Conversion 81 82 JSON stream to XML 83 84 -j2x 85 86 -set setWrapper 87 -rec recordWrapper 88 -nest [flat|recurse|plural|depth] 89 90 ASN.1 stream to XML 91 92 -a2x 93 94 -set setWrapper 95 -rec recordWrapper 96 97 Tab-delimited table to XML 98 99 -t2x 100 101 -set setWrapper 102 -rec recordWrapper 103 -skip linesToSkip 104 -header 105 -lower | -upper 106 -indent | -flush 107 108 XML object names per column 109 110 Comma-separated values file to XML 111 112 -c2x 113 114 -set setWrapper 115 -rec recordWrapper 116 -skip linesToSkip 117 -header 118 -lower | -upper 119 -indent | -flush 120 121 XML object names per column 122 123 GenBank/GenPept flatfile to INSDSeq XML 124 125 -g2x 126 127Sequence Comparison 128 129 -diff Compare two aligned files for point differences 130 131Sequence Editing 132 133 -revcomp Reverse complement nucleotide sequence 134 135 -remove Trim at ends of sequence 136 137 -first Delete first N bases 138 -last Delete last N bases 139 140 -retain Save either end of sequence 141 142 -leading Keep first N bases 143 -trailing Keep last N bases 144 145 -replace Apply base or residue substitution 146 147 -offset Skip ahead by 0-based count (SPDI), or 148 -column Move just before 1-based position (HGVS) 149 150 -delete Delete N bases 151 -insert Insert given sequence 152 153 -lower Lower-case original sequence 154 155 -extract Use xtract -insd feat_location instructions 156 157 -lower Lower-case extracted sequence 158 159Sequence Processing 160 161 -cds2prot Translate coding region into protein 162 163 -code Genetic code 164 -frame Offset in sequence 165 -stop Include stop residue 166 -trim Remove trailing Xs 167 -part5 CDS partial at 5' end 168 -part3 CDS extends past 3' end 169 -every Translate all codons 170 171 -molwt Calculate molecular weight of peptide 172 173 -met Do not cleave leading methionine 174 175Variation Processing 176 177 -hgvs Convert HGVS variation format to XML 178 179String Transformations 180 181 XML 182 183 -encodeXML 184 -decodeXML 185 186 -plainXML 187 188 URL 189 190 -encodeURL 191 -decodeURL 192 193 Base64 194 195 -encode64 196 -decode64 197 198 Protein 199 200 -aa1to3 201 -aa3to1 202 203Customized XML Reformatting 204 205 -format [compact|flush|indent|expand] 206 207 -xml 208 -doctype 209 -comment 210 -cdata 211 -separate 212 -self 213 -unicode [fuse|space|period|brackets|markdown|slash|tag] 214 -script [brackets|markdown] 215 -mathml [terse] 216 217XML Modification 218 219 -filter Object 220 [retain|remove|encode|decode|shrink|expand|accent] 221 [content|cdata|comment|object|attributes|container] 222 223EFetch XML Normalization 224 225 -normalize [database] 226 227Examples 228 229 -j2x -set - -rec GeneRec 230 231 -t2x -set Set -rec Rec -skip 1 Code Name 232 233 -filter ExpXml decode content 234 235 -filter LocationHist remove object 236 237 -normalize pubmed 238 239 -wrp PubmedArticleSet -pattern PubmedArticle -format 240 241Sequence Substitution 242 243 echo ATGAAACCCGGGTTTTAG | 244 transmute -replace -offset 5 -delete 1 -insert G 245 246Protein Translation 247 248 echo "CTAAAACCCGGGTTTCAT" | 249 transmute -revcomp | 250 transmute -cds2prot 251 252Variation Extraction 253 254 echo "NP_000504.1:p.Glu41Lys,NP_000504.1:p.P43Leu,NP_000504.1:p.Trp142Ter" | 255 transmute -hgvs | transmute -format 256 257Sequence Comparison 258 259 transmute -diff <( echo "MKPGSQPVIY" ) <( echo "-KPGFQ*VIY" ) 260 261Translation of Coding Regions 262 263 efetch -db nuccore -id U54469 -format gb | 264 transmute -g2x | 265 xtract -insd CDS sub_sequence | 266 cut -f 2 | 267 while read seq 268 do 269 echo "$seq" | 270 transmute -cds2prot 271 echo "" 272 done 273 274Mitochondrial Mistranslation 275 276 efetch -db nuccore -id NC_012920 -format gb | 277 transmute -g2x | 278 xtract -insd CDS gene product protein_id translation sub_sequence | 279 while IFS=$'\t' read acc gene prod prid prot seq 280 do 281 mito=$( echo "$seq" | transmute -cds2prot -code 2 -stop ) 282 norm=$( echo "$seq" | transmute -cds2prot -code 1 -stop ) 283 if [ "$mito" != "$norm" ] 284 then 285 echo ">$acc $gene $prid $prod" 286 transmute -diff <( echo "$mito" ) <( echo "$norm" ) 287 echo "" 288 fi 289 done 290 291Systematic Mutations 292 293 echo ATGAAACCCGGGTTTTAG | 294 while read seq 295 do 296 for (( i=0; i<${#seq}; i++ )) 297 do 298 ch="${seq:$i:1}" 299 for sub in A C G T 300 do 301 echo "$seq" | 302 transmute -replace -offset "$i" -delete "$ch" -insert "$sub" 303 done 304 done 305 done | 306 while read seq 307 do 308 tns=$( echo "$seq" | transmute -cds2prot ) 309 mwt=$( echo "$tns" | transmute -molwt ) 310 echo -e "${seq}\t${tns}\t${mwt}" 311 done 312` 313 314const transmuteExtra = ` 315Mismatch Detection (RefSeq Proteins with 3 Residue Differences from RefSeq Genome) 316 317 esearch -db gene -query "DMD [GENE] AND human [ORGN]" | 318 efetch -format docsum | 319 xtract -pattern DocumentSummary -block GenomicInfoType \ 320 -tab "\n" -element ChrAccVer,ChrStart,ChrStop | 321 xargs -n 3 sh -c 'efetch -db nuccore -format gbc \ 322 -id "$0" -chr_start "$1" -chr_stop "$2"' > dystrophin.xml 323 324 cat dystrophin.xml | 325 xtract -insd CDS gene product translation sub_sequence > dystrophin.txt 326 327 cat dystrophin.txt | 328 while IFS=$'\t' read acc gene prod prot seq 329 do 330 trans=$( echo "$seq" | transmute -cds2prot ) 331 if [ "$prot" != "$trans" ] 332 then 333 echo ">$acc $gene $prod" 334 transmute -diff <( echo "$prot" ) <( echo "$trans" ) 335 echo "" 336 fi 337 done > failures.txt 338` 339 340// XML FORMATTING FUNCTIONS 341 342// createFormatters does concurrent reformatting, using flush-left to remove leading spaces 343func createFormatters(parent string, format string, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord { 344 345 if inp == nil { 346 return nil 347 } 348 349 out := make(chan eutils.XMLRecord, eutils.ChanDepth()) 350 if out == nil { 351 fmt.Fprintf(os.Stderr, "\nERROR: Unable to create formatter channel\n") 352 os.Exit(1) 353 } 354 355 if format == "" { 356 format = "flush" 357 } 358 359 // xmlFormatter reads partitioned XML from channel and formats on a per-record basis 360 xmlFormatter := func(wg *sync.WaitGroup, parent string, inp <-chan eutils.XMLRecord, out chan<- eutils.XMLRecord) { 361 362 // report when this formatter has no more records to process 363 defer wg.Done() 364 365 // read partitioned XML from producer channel 366 for ext := range inp { 367 368 idx := ext.Index 369 text := ext.Text 370 371 if text == "" { 372 // should never see empty input data 373 out <- eutils.XMLRecord{Index: idx, Text: text} 374 continue 375 } 376 377 // str := doFormat(text[:], parent) 378 379 frm := eutils.FormatRecord(text, parent, eutils.FormatArgs{Format: format}) 380 str := eutils.ChanToString(frm) 381 382 // send even if empty to get all record counts for reordering 383 out <- eutils.XMLRecord{Index: idx, Text: str} 384 } 385 } 386 387 var wg sync.WaitGroup 388 389 // launch multiple formatter goroutines 390 for i := 0; i < eutils.NumServe(); i++ { 391 wg.Add(1) 392 go xmlFormatter(&wg, parent, inp, out) 393 } 394 395 // launch separate anonymous goroutine to wait until all formatters are done 396 go func() { 397 wg.Wait() 398 close(out) 399 }() 400 401 return out 402} 403 404// processFormat reformats XML for ease of reading 405func processFormat(rdr <-chan eutils.XMLBlock, args []string) { 406 407 if rdr == nil || args == nil { 408 return 409 } 410 411 // skip past command name 412 args = args[1:] 413 414 format := "" 415 xml := "" 416 doctype := "" 417 418 doSeparate := true 419 doSelf := false 420 doComment := false 421 doCdata := false 422 423 if len(args) > 0 { 424 // look for [compact|flush|indent|expand] specification 425 format = args[0] 426 if strings.HasPrefix(format, "-") { 427 // ran into next argument, default to indent 428 format = "indent" 429 } else { 430 // skip past first argument 431 args = args[1:] 432 } 433 } else { 434 format = "indent" 435 } 436 437 // look for remaining arguments 438 for len(args) > 0 { 439 440 switch args[0] { 441 case "-xml": 442 args = args[1:] 443 // -xml argument must be followed by value to use in xml line 444 if len(args) < 1 || strings.HasPrefix(args[0], "-") { 445 fmt.Fprintf(os.Stderr, "\nERROR: -xml argument is missing\n") 446 os.Exit(1) 447 } 448 xml = args[0] 449 args = args[1:] 450 case "-doctype": 451 args = args[1:] 452 if len(args) > 0 { 453 // if -doctype argument followed by value, use instead of DOCTYPE line 454 doctype = args[0] 455 args = args[1:] 456 } 457 /* 458 // allow setting of unicode, script, and mathml flags within -format 459 case "-unicode": 460 if len(args) < 2 { 461 fmt.Fprintf(os.Stderr, "\nERROR: Unicode argument is missing\n") 462 os.Exit(1) 463 } 464 // unicodePolicy = args[1] 465 args = args[2:] 466 case "-script": 467 if len(args) < 2 { 468 fmt.Fprintf(os.Stderr, "\nERROR: Script argument is missing\n") 469 os.Exit(1) 470 } 471 // scriptPolicy = args[1] 472 args = args[2:] 473 case "-mathml": 474 if len(args) < 2 { 475 fmt.Fprintf(os.Stderr, "\nERROR: MathML argument is missing\n") 476 os.Exit(1) 477 } 478 // mathmlPolicy = args[1] 479 args = args[2:] 480 */ 481 482 // also allow setting additional processing flags within -format (undocumented) 483 case "-separate", "-separated": 484 doSeparate = false 485 args = args[1:] 486 case "-self", "-self-closing": 487 doSelf = true 488 args = args[1:] 489 case "-comment": 490 doComment = true 491 args = args[1:] 492 case "-cdata": 493 doCdata = true 494 args = args[1:] 495 default: 496 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized option after -format command\n") 497 os.Exit(1) 498 } 499 } 500 501 tknq := eutils.CreateTokenizer(rdr) 502 503 frgs := eutils.FormatArgs{ 504 Format: format, XML: xml, Doctype: doctype, 505 Separate: doSeparate, Self: doSelf, 506 Comment: doComment, Cdata: doCdata} 507 508 frm := eutils.FormatTokens(tknq, frgs) 509 510 eutils.ChanToStdout(frm) 511} 512 513// processTokens shows individual tokens in stream (undocumented) 514func processTokens(rdr <-chan eutils.XMLBlock) { 515 516 if rdr == nil { 517 return 518 } 519 520 tknq := eutils.CreateTokenizer(rdr) 521 522 if tknq == nil { 523 fmt.Fprintf(os.Stderr, "\nERROR: Unable to create debug tokenizer\n") 524 os.Exit(1) 525 } 526 527 var buffer strings.Builder 528 529 count := 0 530 indent := 0 531 532 for tkn := range tknq { 533 534 tag := tkn.Tag 535 name := tkn.Name 536 attr := tkn.Attr 537 538 switch tag { 539 case eutils.STARTTAG: 540 buffer.WriteString("ST: ") 541 for i := 0; i < indent; i++ { 542 buffer.WriteString(" ") 543 } 544 buffer.WriteString(name) 545 buffer.WriteString("\n") 546 if attr != "" { 547 buffer.WriteString("AT: ") 548 for i := 0; i < indent; i++ { 549 buffer.WriteString(" ") 550 } 551 buffer.WriteString(attr) 552 buffer.WriteString("\n") 553 } 554 indent++ 555 case eutils.SELFTAG: 556 buffer.WriteString("SL: ") 557 for i := 0; i < indent; i++ { 558 buffer.WriteString(" ") 559 } 560 buffer.WriteString(name) 561 buffer.WriteString("/") 562 buffer.WriteString("\n") 563 if attr != "" { 564 buffer.WriteString("AT: ") 565 for i := 0; i < indent; i++ { 566 buffer.WriteString(" ") 567 } 568 buffer.WriteString(attr) 569 buffer.WriteString("\n") 570 } 571 case eutils.STOPTAG: 572 indent-- 573 buffer.WriteString("SP: ") 574 for i := 0; i < indent; i++ { 575 buffer.WriteString(" ") 576 } 577 buffer.WriteString(name) 578 buffer.WriteString("/") 579 buffer.WriteString("\n") 580 case eutils.CONTENTTAG: 581 ctype := tkn.Cont 582 if (ctype & eutils.LFTSPACE) != 0 { 583 if (ctype & eutils.RGTSPACE) != 0 { 584 buffer.WriteString("FL: ") 585 } else { 586 buffer.WriteString("LF: ") 587 } 588 } else if (ctype & eutils.RGTSPACE) != 0 { 589 buffer.WriteString("RT: ") 590 } else { 591 buffer.WriteString("VL: ") 592 } 593 for i := 0; i < indent; i++ { 594 buffer.WriteString(" ") 595 } 596 buffer.WriteString(name) 597 buffer.WriteString("\n") 598 case eutils.CDATATAG: 599 buffer.WriteString("CD: ") 600 for i := 0; i < indent; i++ { 601 buffer.WriteString(" ") 602 } 603 buffer.WriteString(name) 604 buffer.WriteString("\n") 605 case eutils.COMMENTTAG: 606 buffer.WriteString("CO: ") 607 for i := 0; i < indent; i++ { 608 buffer.WriteString(" ") 609 } 610 buffer.WriteString(name) 611 buffer.WriteString("\n") 612 case eutils.DOCTYPETAG: 613 buffer.WriteString("DC: ") 614 for i := 0; i < indent; i++ { 615 buffer.WriteString(" ") 616 } 617 buffer.WriteString(name) 618 buffer.WriteString("\n") 619 case eutils.NOTAG: 620 buffer.WriteString("NO:") 621 if indent != 0 { 622 buffer.WriteString(" (indent ") 623 buffer.WriteString(strconv.Itoa(indent)) 624 buffer.WriteString(")") 625 } 626 buffer.WriteString("\n") 627 case eutils.ISCLOSED: 628 buffer.WriteString("CL:") 629 if indent != 0 { 630 buffer.WriteString(" (indent ") 631 buffer.WriteString(strconv.Itoa(indent)) 632 buffer.WriteString(")") 633 } 634 buffer.WriteString("\n") 635 txt := buffer.String() 636 if txt != "" { 637 // print final buffer 638 fmt.Fprintf(os.Stdout, "%s", txt) 639 } 640 return 641 default: 642 buffer.WriteString("UNKONWN:") 643 if indent != 0 { 644 buffer.WriteString(" (indent ") 645 buffer.WriteString(strconv.Itoa(indent)) 646 buffer.WriteString(")") 647 } 648 buffer.WriteString("\n") 649 } 650 651 count++ 652 if count > 1000 { 653 count = 0 654 txt := buffer.String() 655 if txt != "" { 656 // print current buffered output 657 fmt.Fprintf(os.Stdout, "%s", txt) 658 } 659 buffer.Reset() 660 } 661 } 662} 663 664// processOutline displays outline of XML structure 665func processOutline(rdr <-chan eutils.XMLBlock) { 666 667 if rdr == nil { 668 return 669 } 670 671 tknq := eutils.CreateTokenizer(rdr) 672 673 if tknq == nil { 674 fmt.Fprintf(os.Stderr, "\nERROR: Unable to create outline tokenizer\n") 675 os.Exit(1) 676 } 677 678 var buffer strings.Builder 679 680 count := 0 681 indent := 0 682 683 for tkn := range tknq { 684 685 tag := tkn.Tag 686 name := tkn.Name 687 688 switch tag { 689 case eutils.STARTTAG: 690 if name == "eSummaryResult" || 691 name == "eLinkResult" || 692 name == "eInfoResult" || 693 name == "PubmedArticleSet" || 694 name == "DocumentSummarySet" || 695 name == "INSDSet" || 696 name == "Entrezgene-Set" || 697 name == "TaxaSet" { 698 break 699 } 700 for i := 0; i < indent; i++ { 701 buffer.WriteString(" ") 702 } 703 buffer.WriteString(name) 704 buffer.WriteString("\n") 705 indent++ 706 case eutils.SELFTAG: 707 for i := 0; i < indent; i++ { 708 buffer.WriteString(" ") 709 } 710 buffer.WriteString(name) 711 buffer.WriteString("\n") 712 case eutils.STOPTAG: 713 indent-- 714 case eutils.DOCTYPETAG: 715 case eutils.NOTAG: 716 case eutils.ISCLOSED: 717 txt := buffer.String() 718 if txt != "" { 719 // print final buffer 720 fmt.Fprintf(os.Stdout, "%s", txt) 721 } 722 return 723 default: 724 } 725 726 count++ 727 if count > 1000 { 728 count = 0 729 txt := buffer.String() 730 if txt != "" { 731 // print current buffered output 732 fmt.Fprintf(os.Stdout, "%s", txt) 733 } 734 buffer.Reset() 735 } 736 } 737} 738 739// processSynopsis displays paths to XML elements 740func processSynopsis(rdr <-chan eutils.XMLBlock, leaf bool, delim string) { 741 742 if rdr == nil { 743 return 744 } 745 746 tknq := eutils.CreateTokenizer(rdr) 747 748 if tknq == nil { 749 fmt.Fprintf(os.Stderr, "\nERROR: Unable to create synopsis tokenizer\n") 750 os.Exit(1) 751 } 752 753 var buffer strings.Builder 754 count := 0 755 756 // synopsisLevel recursive definition 757 var synopsisLevel func(string) bool 758 759 synopsisLevel = func(parent string) bool { 760 761 for tkn := range tknq { 762 763 tag := tkn.Tag 764 name := tkn.Name 765 766 switch tag { 767 case eutils.STARTTAG: 768 if name == "eSummaryResult" || 769 name == "eLinkResult" || 770 name == "eInfoResult" || 771 name == "PubmedArticleSet" || 772 name == "DocumentSummarySet" || 773 name == "INSDSet" || 774 name == "Entrezgene-Set" || 775 name == "TaxaSet" { 776 break 777 } 778 if leaf { 779 if name == "root" || 780 name == "opt" || 781 name == "anon" { 782 break 783 } 784 } 785 if !leaf { 786 // show all paths, including container objects 787 if parent != "" { 788 buffer.WriteString(parent) 789 buffer.WriteString(delim) 790 } 791 buffer.WriteString(name) 792 buffer.WriteString("\n") 793 } 794 path := parent 795 if path != "" { 796 path += delim 797 } 798 path += name 799 if synopsisLevel(path) { 800 return true 801 } 802 case eutils.SELFTAG: 803 if parent != "" { 804 buffer.WriteString(parent) 805 buffer.WriteString(delim) 806 } 807 buffer.WriteString(name) 808 buffer.WriteString("\n") 809 case eutils.STOPTAG: 810 // break recursion 811 return false 812 case eutils.CONTENTTAG: 813 if leaf { 814 // only show endpoint paths 815 if parent != "" { 816 buffer.WriteString(parent) 817 buffer.WriteString("\n") 818 } 819 } 820 case eutils.DOCTYPETAG: 821 case eutils.NOTAG: 822 case eutils.ISCLOSED: 823 txt := buffer.String() 824 if txt != "" { 825 // print final buffer 826 fmt.Fprintf(os.Stdout, "%s", txt) 827 } 828 return true 829 default: 830 } 831 832 count++ 833 if count > 1000 { 834 count = 0 835 txt := buffer.String() 836 if txt != "" { 837 // print current buffered output 838 fmt.Fprintf(os.Stdout, "%s", txt) 839 } 840 buffer.Reset() 841 } 842 } 843 return true 844 } 845 846 for { 847 // may have concatenated XMLs, loop through all 848 if synopsisLevel("") { 849 return 850 } 851 } 852} 853 854// processFilter modifies XML content, comments, or CDATA 855func processFilter(rdr <-chan eutils.XMLBlock, args []string) { 856 857 if rdr == nil || args == nil { 858 return 859 } 860 861 tknq := eutils.CreateTokenizer(rdr) 862 863 if tknq == nil { 864 fmt.Fprintf(os.Stderr, "\nERROR: Unable to create filter tokenizer\n") 865 os.Exit(1) 866 } 867 868 var buffer strings.Builder 869 870 count := 0 871 872 // skip past command name 873 args = args[1:] 874 875 max := len(args) 876 if max < 1 { 877 fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to transmute -filter\n") 878 os.Exit(1) 879 } 880 881 pttrn := args[0] 882 883 args = args[1:] 884 max-- 885 886 if max < 2 { 887 fmt.Fprintf(os.Stderr, "\nERROR: No object name supplied to transmute -filter\n") 888 os.Exit(1) 889 } 890 891 type ActionType int 892 893 const ( 894 NOACTION ActionType = iota 895 DORETAIN 896 DOREMOVE 897 DOENCODE 898 DODECODE 899 DOSHRINK 900 DOEXPAND 901 DOACCENT 902 ) 903 904 action := args[0] 905 906 what := NOACTION 907 switch action { 908 case "retain": 909 what = DORETAIN 910 case "remove": 911 what = DOREMOVE 912 case "encode": 913 what = DOENCODE 914 case "decode": 915 what = DODECODE 916 case "shrink": 917 what = DOSHRINK 918 case "expand": 919 what = DOEXPAND 920 case "accent": 921 what = DOACCENT 922 default: 923 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized action '%s' supplied to transmute -filter\n", action) 924 os.Exit(1) 925 } 926 927 trget := args[1] 928 929 which := eutils.NOTAG 930 switch trget { 931 case "attribute", "attributes": 932 which = eutils.ATTRIBTAG 933 case "content", "contents": 934 which = eutils.CONTENTTAG 935 case "cdata", "CDATA": 936 which = eutils.CDATATAG 937 case "comment", "comments": 938 which = eutils.COMMENTTAG 939 case "object": 940 // object normally retained 941 which = eutils.OBJECTTAG 942 case "container": 943 which = eutils.CONTAINERTAG 944 default: 945 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized target '%s' supplied to transmute -filter\n", trget) 946 os.Exit(1) 947 } 948 949 inPattern := false 950 prevName := "" 951 952 for tkn := range tknq { 953 954 tag := tkn.Tag 955 name := tkn.Name 956 attr := tkn.Attr 957 958 switch tag { 959 case eutils.STARTTAG: 960 prevName = name 961 if name == pttrn { 962 inPattern = true 963 if which == eutils.CONTAINERTAG && what == DOREMOVE { 964 continue 965 } 966 } 967 if inPattern && which == eutils.OBJECTTAG && what == DOREMOVE { 968 continue 969 } 970 buffer.WriteString("<") 971 buffer.WriteString(name) 972 if attr != "" { 973 if which != eutils.ATTRIBTAG || what != DOREMOVE { 974 attr = strings.TrimSpace(attr) 975 attr = eutils.CompressRunsOfSpaces(attr) 976 buffer.WriteString(" ") 977 buffer.WriteString(attr) 978 } 979 } 980 buffer.WriteString(">\n") 981 case eutils.SELFTAG: 982 if inPattern && which == eutils.OBJECTTAG && what == DOREMOVE { 983 continue 984 } 985 buffer.WriteString("<") 986 buffer.WriteString(name) 987 if attr != "" { 988 if which != eutils.ATTRIBTAG || what != DOREMOVE { 989 attr = strings.TrimSpace(attr) 990 attr = eutils.CompressRunsOfSpaces(attr) 991 buffer.WriteString(" ") 992 buffer.WriteString(attr) 993 } 994 } 995 buffer.WriteString("/>\n") 996 case eutils.STOPTAG: 997 if name == pttrn { 998 inPattern = false 999 if which == eutils.OBJECTTAG && what == DOREMOVE { 1000 continue 1001 } 1002 if which == eutils.CONTAINERTAG && what == DOREMOVE { 1003 continue 1004 } 1005 } 1006 if inPattern && which == eutils.OBJECTTAG && what == DOREMOVE { 1007 continue 1008 } 1009 buffer.WriteString("</") 1010 buffer.WriteString(name) 1011 buffer.WriteString(">\n") 1012 case eutils.CONTENTTAG: 1013 if inPattern && which == eutils.OBJECTTAG && what == DOREMOVE { 1014 continue 1015 } 1016 if inPattern && which == eutils.CONTENTTAG && what == DOEXPAND { 1017 var words []string 1018 if strings.Contains(name, "|") { 1019 words = strings.FieldsFunc(name, func(c rune) bool { 1020 return c == '|' 1021 }) 1022 } else if strings.Contains(name, ",") { 1023 words = strings.FieldsFunc(name, func(c rune) bool { 1024 return c == ',' 1025 }) 1026 } else { 1027 words = strings.Fields(name) 1028 } 1029 between := "" 1030 for _, item := range words { 1031 max := len(item) 1032 for max > 1 { 1033 ch := item[max-1] 1034 if ch != '.' && ch != ',' && ch != ':' && ch != ';' { 1035 break 1036 } 1037 // trim trailing punctuation 1038 item = item[:max-1] 1039 // continue checking for runs of punctuation at end 1040 max-- 1041 } 1042 if eutils.HasFlankingSpace(item) { 1043 item = strings.TrimSpace(item) 1044 } 1045 if item != "" { 1046 if between != "" { 1047 buffer.WriteString(between) 1048 } 1049 buffer.WriteString(item) 1050 buffer.WriteString("\n") 1051 between = "</" + prevName + ">\n<" + prevName + ">\n" 1052 } 1053 } 1054 continue 1055 } 1056 if inPattern && which == tag { 1057 switch what { 1058 case DORETAIN: 1059 // default behavior for content - can use -filter X retain content as a no-op 1060 case DOREMOVE: 1061 continue 1062 case DOENCODE: 1063 name = html.EscapeString(name) 1064 case DODECODE: 1065 name = html.UnescapeString(name) 1066 case DOSHRINK: 1067 name = eutils.CompressRunsOfSpaces(name) 1068 case DOACCENT: 1069 if eutils.IsNotASCII(name) { 1070 name = eutils.DoAccentTransform(name) 1071 } 1072 default: 1073 continue 1074 } 1075 } 1076 // content normally printed 1077 if eutils.HasFlankingSpace(name) { 1078 name = strings.TrimSpace(name) 1079 } 1080 buffer.WriteString(name) 1081 buffer.WriteString("\n") 1082 case eutils.CDATATAG: 1083 if inPattern && which == eutils.OBJECTTAG && what == DOREMOVE { 1084 continue 1085 } 1086 if inPattern && which == tag { 1087 switch what { 1088 case DORETAIN: 1089 // cdata requires explicit retain command 1090 case DOREMOVE: 1091 continue 1092 case DOENCODE: 1093 name = html.EscapeString(name) 1094 case DODECODE: 1095 name = html.UnescapeString(name) 1096 case DOSHRINK: 1097 name = eutils.CompressRunsOfSpaces(name) 1098 case DOACCENT: 1099 if eutils.IsNotASCII(name) { 1100 name = eutils.DoAccentTransform(name) 1101 } 1102 default: 1103 continue 1104 } 1105 // cdata normally removed 1106 if eutils.HasFlankingSpace(name) { 1107 name = strings.TrimSpace(name) 1108 } 1109 buffer.WriteString(name) 1110 buffer.WriteString("\n") 1111 } 1112 case eutils.COMMENTTAG: 1113 if inPattern && which == eutils.OBJECTTAG && what == DOREMOVE { 1114 continue 1115 } 1116 if inPattern && which == tag { 1117 switch what { 1118 case DORETAIN: 1119 // comment requires explicit retain command 1120 case DOREMOVE: 1121 continue 1122 case DOENCODE: 1123 name = html.EscapeString(name) 1124 case DODECODE: 1125 name = html.UnescapeString(name) 1126 case DOSHRINK: 1127 name = eutils.CompressRunsOfSpaces(name) 1128 case DOACCENT: 1129 if eutils.IsNotASCII(name) { 1130 name = eutils.DoAccentTransform(name) 1131 } 1132 default: 1133 continue 1134 } 1135 // comment normally removed 1136 if eutils.HasFlankingSpace(name) { 1137 name = strings.TrimSpace(name) 1138 } 1139 buffer.WriteString(name) 1140 buffer.WriteString("\n") 1141 } 1142 case eutils.DOCTYPETAG: 1143 case eutils.NOTAG: 1144 case eutils.ISCLOSED: 1145 txt := buffer.String() 1146 if txt != "" { 1147 // print final buffer 1148 fmt.Fprintf(os.Stdout, "%s", txt) 1149 } 1150 return 1151 default: 1152 } 1153 1154 count++ 1155 if count > 1000 { 1156 count = 0 1157 txt := buffer.String() 1158 if txt != "" { 1159 // print current buffered output 1160 fmt.Fprintf(os.Stdout, "%s", txt) 1161 } 1162 buffer.Reset() 1163 } 1164 } 1165} 1166 1167// STRING CONVERTERS 1168 1169func encodeURL(inp io.Reader) { 1170 1171 if inp == nil { 1172 return 1173 } 1174 1175 data, _ := ioutil.ReadAll(inp) 1176 txt := string(data) 1177 txt = strings.TrimSuffix(txt, "\n") 1178 1179 str := url.QueryEscape(txt) 1180 1181 os.Stdout.WriteString(str) 1182 if !strings.HasSuffix(str, "\n") { 1183 os.Stdout.WriteString("\n") 1184 } 1185} 1186 1187func decodeURL(inp io.Reader) { 1188 1189 if inp == nil { 1190 return 1191 } 1192 1193 byt, _ := ioutil.ReadAll(inp) 1194 txt := string(byt) 1195 txt = strings.TrimSuffix(txt, "\n") 1196 1197 str, _ := url.QueryUnescape(txt) 1198 1199 os.Stdout.WriteString(str) 1200 if !strings.HasSuffix(str, "\n") { 1201 os.Stdout.WriteString("\n") 1202 } 1203} 1204 1205func encodeB64(inp io.Reader) { 1206 1207 if inp == nil { 1208 return 1209 } 1210 1211 data, _ := ioutil.ReadAll(inp) 1212 1213 str := base64.StdEncoding.EncodeToString(data) 1214 1215 os.Stdout.WriteString(str) 1216 if !strings.HasSuffix(str, "\n") { 1217 os.Stdout.WriteString("\n") 1218 } 1219} 1220 1221func decodeB64(inp io.Reader) { 1222 1223 if inp == nil { 1224 return 1225 } 1226 1227 byt, _ := ioutil.ReadAll(inp) 1228 1229 data, _ := base64.StdEncoding.DecodeString(string(byt)) 1230 str := string(data) 1231 1232 os.Stdout.WriteString(str) 1233 if !strings.HasSuffix(str, "\n") { 1234 os.Stdout.WriteString("\n") 1235 } 1236} 1237 1238func decodeHGVS(inp io.Reader) { 1239 1240 if inp == nil { 1241 return 1242 } 1243 1244 byt, _ := ioutil.ReadAll(inp) 1245 txt := string(byt) 1246 1247 os.Stdout.WriteString("<HGVS>\n") 1248 1249 str := eutils.ParseHGVS(txt) 1250 1251 os.Stdout.WriteString(str) 1252 if !strings.HasSuffix(str, "\n") { 1253 os.Stdout.WriteString("\n") 1254 } 1255 1256 os.Stdout.WriteString("</HGVS>\n") 1257} 1258 1259// COLUMN ALIGNMENT FORMATTER 1260 1261// processAlign aligns a tab-delimited table by individual column widths 1262func processAlign(inp io.Reader, args []string) { 1263 1264 // tab-delimited-table to padded-by-spaces alignment inspired by 1265 // Steve Kinzler's align script - see http://kinzler.com/me/align/ 1266 1267 if inp == nil { 1268 return 1269 } 1270 1271 mrg := 0 1272 pdg := 0 1273 aln := "" 1274 1275 // skip past command name 1276 args = args[1:] 1277 1278 for len(args) > 0 { 1279 1280 switch args[0] { 1281 case "-g": 1282 pdg = eutils.GetNumericArg(args, "-g spacing between columns", 0, 1, 30) 1283 args = args[2:] 1284 case "-h": 1285 mrg = eutils.GetNumericArg(args, "-i indent before columns", 0, 1, 30) 1286 args = args[2:] 1287 case "-a": 1288 aln = eutils.GetStringArg(args, "-a column alignment code string") 1289 args = args[2:] 1290 default: 1291 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized option after -align command\n") 1292 os.Exit(1) 1293 } 1294 } 1295 1296 algn := eutils.AlignColumns(inp, mrg, pdg, aln) 1297 1298 if algn == nil { 1299 fmt.Fprintf(os.Stderr, "\nERROR: Unable to create alignment function\n") 1300 os.Exit(1) 1301 } 1302 1303 eutils.ChanToStdout(algn) 1304 1305 return 1306} 1307 1308// SEQUENCE EDITING 1309 1310func sequenceRemove(inp io.Reader, args []string) { 1311 1312 if inp == nil { 1313 return 1314 } 1315 1316 first := "" 1317 last := "" 1318 1319 // skip past command name 1320 args = args[1:] 1321 1322 for len(args) > 0 { 1323 1324 switch args[0] { 1325 case "-first": 1326 first = eutils.GetStringArg(args, "Bases to delete at beginning") 1327 first = strings.ToUpper(first) 1328 args = args[2:] 1329 case "-last": 1330 last = eutils.GetStringArg(args, "Bases to delete at end") 1331 last = strings.ToUpper(last) 1332 args = args[2:] 1333 default: 1334 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized option after -remove command\n") 1335 os.Exit(1) 1336 } 1337 } 1338 1339 str := eutils.ReadAllIntoSequence(inp) 1340 1341 str = eutils.SequenceRemove(str, first, last) 1342 1343 os.Stdout.WriteString(str) 1344 if !strings.HasSuffix(str, "\n") { 1345 os.Stdout.WriteString("\n") 1346 } 1347} 1348 1349func sequenceRetain(inp io.Reader, args []string) { 1350 1351 if inp == nil { 1352 return 1353 } 1354 1355 lead := 0 1356 trail := 0 1357 1358 // skip past command name 1359 args = args[1:] 1360 1361 for len(args) > 0 { 1362 1363 switch args[0] { 1364 case "-leading": 1365 lead = eutils.GetNumericArg(args, "Bases to keep at beginning", 0, -1, -1) 1366 args = args[2:] 1367 case "-trailing": 1368 trail = eutils.GetNumericArg(args, "Bases to keep at end", 0, -1, -1) 1369 args = args[2:] 1370 default: 1371 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized option after -retain command\n") 1372 os.Exit(1) 1373 } 1374 } 1375 1376 str := eutils.ReadAllIntoSequence(inp) 1377 1378 str = eutils.SequenceRetain(str, lead, trail) 1379 1380 os.Stdout.WriteString(str) 1381 if !strings.HasSuffix(str, "\n") { 1382 os.Stdout.WriteString("\n") 1383 } 1384} 1385 1386func sequenceReplace(inp io.Reader, args []string) { 1387 1388 if inp == nil { 1389 return 1390 } 1391 1392 pos := 0 1393 del := "" 1394 ins := "" 1395 lower := false 1396 1397 // skip past command name 1398 args = args[1:] 1399 1400 for len(args) > 0 { 1401 1402 switch args[0] { 1403 case "-offset": 1404 pos = eutils.GetNumericArg(args, "0-based position", 0, -1, -1) 1405 args = args[2:] 1406 case "-column": 1407 val := eutils.GetNumericArg(args, "1-based position", 1, -1, -1) 1408 pos = val - 1 1409 args = args[2:] 1410 case "-delete": 1411 del = eutils.GetStringArg(args, "Number to delete") 1412 del = strings.ToUpper(del) 1413 args = args[2:] 1414 case "-insert": 1415 ins = eutils.GetStringArg(args, "Bases to insert") 1416 ins = strings.ToUpper(ins) 1417 args = args[2:] 1418 case "-lower": 1419 lower = true 1420 args = args[1:] 1421 default: 1422 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized option after -replace command\n") 1423 os.Exit(1) 1424 } 1425 } 1426 1427 str := eutils.ReadAllIntoSequence(inp) 1428 1429 if lower { 1430 str = strings.ToLower(str) 1431 } 1432 1433 str = eutils.SequenceReplace(str, pos, del, ins) 1434 1435 os.Stdout.WriteString(str) 1436 if !strings.HasSuffix(str, "\n") { 1437 os.Stdout.WriteString("\n") 1438 } 1439} 1440 1441func sequenceExtract(inp io.Reader, args []string) { 1442 1443 if inp == nil { 1444 return 1445 } 1446 1447 featLoc := "" 1448 lower := false 1449 1450 // skip past command name 1451 args = args[1:] 1452 1453 for len(args) > 0 { 1454 1455 switch args[0] { 1456 case "-lower": 1457 lower = true 1458 args = args[1:] 1459 default: 1460 // read output of xtract -insd feat_location qualifier 1461 featLoc = args[0] 1462 args = args[1:] 1463 } 1464 } 1465 1466 if featLoc == "" { 1467 fmt.Fprintf(os.Stderr, "\nERROR: Missing argument after -extract command\n") 1468 os.Exit(1) 1469 } 1470 1471 str := eutils.ReadAllIntoSequence(inp) 1472 1473 str = eutils.SequenceExtract(str, featLoc) 1474 1475 if lower { 1476 str = strings.ToLower(str) 1477 } 1478 1479 os.Stdout.WriteString(str) 1480 if !strings.HasSuffix(str, "\n") { 1481 os.Stdout.WriteString("\n") 1482 } 1483} 1484 1485// REVERSE SEQUENCE 1486 1487// seqFlip reverses without complementing - e.g., minus strand proteins translated in reverse order 1488func seqFlip(inp io.Reader) { 1489 1490 if inp == nil { 1491 return 1492 } 1493 1494 str := eutils.ReadAllIntoSequence(inp) 1495 1496 str = eutils.SequenceReverse(str) 1497 1498 os.Stdout.WriteString(str) 1499 if !strings.HasSuffix(str, "\n") { 1500 os.Stdout.WriteString("\n") 1501 } 1502} 1503 1504// REVERSE COMPLEMENT 1505 1506func nucRevComp(inp io.Reader) { 1507 1508 if inp == nil { 1509 return 1510 } 1511 1512 str := eutils.ReadAllIntoSequence(inp) 1513 1514 str = eutils.ReverseComplement(str) 1515 1516 os.Stdout.WriteString(str) 1517 if !strings.HasSuffix(str, "\n") { 1518 os.Stdout.WriteString("\n") 1519 } 1520} 1521 1522// FASTA DIFFERENCES 1523 1524func printFastaPairs(frst, scnd string) { 1525 1526 frst = strings.ToLower(frst) 1527 scnd = strings.ToLower(scnd) 1528 1529 fst := frst[:] 1530 scd := scnd[:] 1531 1532 // next functions return spaces after end of sequence 1533 nextF := func() rune { 1534 1535 if len(fst) < 1 { 1536 return ' ' 1537 } 1538 ch := fst[0] 1539 fst = fst[1:] 1540 1541 return rune(ch) 1542 } 1543 1544 nextS := func() rune { 1545 1546 if len(scd) < 1 { 1547 return ' ' 1548 } 1549 ch := scd[0] 1550 scd = scd[1:] 1551 1552 return rune(ch) 1553 } 1554 1555 var fs []rune 1556 var sc []rune 1557 mx := 0 1558 1559 // populate output arrays 1560 for { 1561 1562 f, s := nextF(), nextS() 1563 // if both spaces, end of both sequences 1564 if f == ' ' && s == ' ' { 1565 break 1566 } 1567 if f == s { 1568 fs = append(fs, f) 1569 sc = append(sc, ' ') 1570 } else { 1571 // show mismatches in upper case 1572 fs = append(fs, unicode.ToUpper(f)) 1573 sc = append(sc, unicode.ToUpper(s)) 1574 } 1575 mx++ 1576 } 1577 1578 // pad output to multiple of 50 1579 j := mx % 50 1580 if j > 0 { 1581 for j < 50 { 1582 fs = append(fs, ' ') 1583 sc = append(sc, ' ') 1584 j++ 1585 mx++ 1586 } 1587 } 1588 1589 // print in blocks of 50 bases or residues 1590 for i := 0; i < mx; i += 50 { 1591 dl := 50 1592 if mx-i < 50 { 1593 dl = mx - i 1594 } 1595 lf := fs[:dl] 1596 rt := sc[:dl] 1597 fs = fs[dl:] 1598 sc = sc[dl:] 1599 tm := strings.TrimRight(string(lf), " ") 1600 fmt.Fprintf(os.Stdout, "%s %6d\n%s\n", string(lf), i+len(tm), string(rt)) 1601 } 1602} 1603 1604func fastaDiff(inp io.Reader, args []string) { 1605 1606 if inp == nil { 1607 return 1608 } 1609 1610 // skip past command name 1611 args = args[1:] 1612 1613 if len(args) != 2 { 1614 fmt.Fprintf(os.Stderr, "\nERROR: Two files required by -diff command\n") 1615 os.Exit(1) 1616 } 1617 1618 frst := args[0] 1619 scnd := args[1] 1620 1621 frstFasta := eutils.ReadFromFileIntoSequence(frst) 1622 scndFasta := eutils.ReadFromFileIntoSequence(scnd) 1623 1624 if frstFasta == scndFasta { 1625 return 1626 } 1627 1628 // sequences are assumed to be aligned, this code highlight mismatches 1629 printFastaPairs(frstFasta, scndFasta) 1630} 1631 1632// PROTEIN WEIGHT 1633 1634func protWeight(inp io.Reader, args []string) { 1635 1636 if inp == nil { 1637 return 1638 } 1639 1640 trimLeadingMet := true 1641 1642 // skip past command name 1643 args = args[1:] 1644 1645 for len(args) > 0 { 1646 1647 switch args[0] { 1648 case "-met": 1649 trimLeadingMet = false 1650 args = args[1:] 1651 default: 1652 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized option after -molwt command\n") 1653 os.Exit(1) 1654 } 1655 } 1656 1657 str := eutils.ReadAllIntoSequence(inp) 1658 1659 str = eutils.ProteinWeight(str, trimLeadingMet) 1660 1661 os.Stdout.WriteString(str) 1662 if !strings.HasSuffix(str, "\n") { 1663 os.Stdout.WriteString("\n") 1664 } 1665} 1666 1667// cdRegionToProtein reads all of stdin as sequence data 1668func cdRegionToProtein(inp io.Reader, args []string) { 1669 1670 if inp == nil { 1671 return 1672 } 1673 1674 genCode := 1 1675 frame := 0 1676 includeStop := false 1677 doEveryCodon := false 1678 removeTrailingX := false 1679 is5primeComplete := true 1680 is3primeComplete := true 1681 1682 repeat := 1 1683 1684 // skip past command name 1685 args = args[1:] 1686 1687 for len(args) > 0 { 1688 1689 switch args[0] { 1690 case "-code", "-gencode": 1691 genCode = eutils.GetNumericArg(args, "genetic code number", 0, 1, 30) 1692 args = args[2:] 1693 case "-frame": 1694 frame = eutils.GetNumericArg(args, "offset into coding sequence", 0, 1, 30) 1695 args = args[2:] 1696 case "-stop", "-stops": 1697 includeStop = true 1698 args = args[1:] 1699 case "-every", "-all": 1700 doEveryCodon = true 1701 args = args[1:] 1702 case "-trim", "-trailing": 1703 removeTrailingX = true 1704 args = args[1:] 1705 case "-part5", "-partial5", "-lt5": 1706 is5primeComplete = false 1707 args = args[1:] 1708 case "-part3", "-partial3", "-gt3": 1709 is3primeComplete = false 1710 args = args[1:] 1711 case "-repeat": 1712 repeat = eutils.GetNumericArg(args, "number of repetitions for testing", 1, 1, 100) 1713 args = args[2:] 1714 default: 1715 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized option after -cds2prot command\n") 1716 os.Exit(1) 1717 } 1718 } 1719 1720 txt := eutils.ReadAllIntoSequence(inp) 1721 1722 for i := 0; i < repeat; i++ { 1723 1724 // repeat multiple times for performance testing (undocumented) 1725 str := eutils.TranslateCdRegion(txt, genCode, frame, includeStop, doEveryCodon, removeTrailingX, is5primeComplete, is3primeComplete) 1726 1727 os.Stdout.WriteString(str) 1728 if !strings.HasSuffix(str, "\n") { 1729 os.Stdout.WriteString("\n") 1730 } 1731 } 1732} 1733 1734// MAIN FUNCTION 1735 1736func main() { 1737 1738 // skip past executable name 1739 args := os.Args[1:] 1740 1741 if len(args) < 1 { 1742 fmt.Fprintf(os.Stderr, "\nERROR: No command-line arguments supplied to transmute\n") 1743 os.Exit(1) 1744 } 1745 1746 // performance arguments 1747 chanDepth := 0 1748 farmSize := 0 1749 heapSize := 0 1750 numServe := 0 1751 goGc := 0 1752 1753 // processing option arguments 1754 doCompress := false 1755 doCleanup := false 1756 doStrict := false 1757 doMixed := false 1758 deAccent := false 1759 doASCII := false 1760 1761 /* 1762 doUnicode := false 1763 doScript := false 1764 doMathML := false 1765 */ 1766 1767 // CONCURRENCY, CLEANUP, AND DEBUGGING FLAGS 1768 1769 // do these first because -defcpu and -maxcpu can be sent from wrapper before other arguments 1770 1771 ncpu := runtime.NumCPU() 1772 if ncpu < 1 { 1773 ncpu = 1 1774 } 1775 1776 // wrapper can limit maximum number of processors to use (undocumented) 1777 maxProcs := 0 1778 defProcs := 0 1779 1780 // concurrent performance tuning parameters, can be overridden by -proc and -cons 1781 numProcs := 0 1782 serverRatio := 4 1783 1784 // -flag sets -strict or -mixed cleanup flags from argument 1785 flgs := "" 1786 1787 /* 1788 unicodePolicy := "" 1789 scriptPolicy := "" 1790 mathmlPolicy := "" 1791 */ 1792 1793 // read data from file instead of stdin 1794 fileName := "" 1795 1796 // debugging 1797 stts := false 1798 timr := false 1799 1800 // profiling 1801 prfl := false 1802 1803 inSwitch := true 1804 1805 // get concurrency, cleanup, and debugging flags in any order 1806 for { 1807 1808 inSwitch = true 1809 1810 switch args[0] { 1811 1812 // concurrency override arguments can be passed in by local wrapper script (undocumented) 1813 case "-maxcpu": 1814 maxProcs = eutils.GetNumericArg(args, "Maximum number of processors", 1, 1, ncpu) 1815 args = args[1:] 1816 case "-defcpu": 1817 defProcs = eutils.GetNumericArg(args, "Default number of processors", ncpu, 1, ncpu) 1818 args = args[1:] 1819 // performance tuning flags 1820 case "-proc": 1821 numProcs = eutils.GetNumericArg(args, "Number of processors", ncpu, 1, ncpu) 1822 args = args[1:] 1823 case "-cons": 1824 serverRatio = eutils.GetNumericArg(args, "Parser to processor ratio", 4, 1, 32) 1825 args = args[1:] 1826 case "-serv": 1827 numServe = eutils.GetNumericArg(args, "Concurrent parser count", 0, 1, 128) 1828 args = args[1:] 1829 case "-chan": 1830 chanDepth = eutils.GetNumericArg(args, "Communication channel depth", 0, ncpu, 128) 1831 args = args[1:] 1832 case "-heap": 1833 heapSize = eutils.GetNumericArg(args, "Unshuffler heap size", 8, 8, 64) 1834 args = args[1:] 1835 case "-farm": 1836 farmSize = eutils.GetNumericArg(args, "Node buffer length", 4, 4, 2048) 1837 args = args[1:] 1838 case "-gogc": 1839 goGc = eutils.GetNumericArg(args, "Garbage collection percentage", 0, 50, 1000) 1840 args = args[1:] 1841 1842 // read data from file 1843 case "-input": 1844 if len(args) < 2 { 1845 fmt.Fprintf(os.Stderr, "\nERROR: Input file name is missing\n") 1846 os.Exit(1) 1847 } 1848 fileName = args[1] 1849 // skip past first of two arguments 1850 args = args[1:] 1851 1852 // data cleanup flags 1853 case "-compress", "-compressed": 1854 doCompress = true 1855 case "-spaces", "-cleanup": 1856 doCleanup = true 1857 case "-strict": 1858 doStrict = true 1859 case "-mixed": 1860 doMixed = true 1861 case "-accent": 1862 deAccent = true 1863 case "-ascii": 1864 doASCII = true 1865 1866 // previously visible processing flags (undocumented) 1867 case "-stems", "-stem": 1868 // ignore 1869 case "-stops", "-stop": 1870 // ignore 1871 1872 // allow setting of unicode, script, and mathml flags (undocumented) 1873 case "-unicode": 1874 if len(args) < 2 { 1875 fmt.Fprintf(os.Stderr, "\nERROR: -unicode argument is missing\n") 1876 os.Exit(1) 1877 } 1878 // unicodePolicy = eutils.GetStringArg(args, "Unicode argument") 1879 args = args[1:] 1880 case "-script": 1881 if len(args) < 2 { 1882 fmt.Fprintf(os.Stderr, "\nERROR: -script argument is missing\n") 1883 os.Exit(1) 1884 } 1885 // scriptPolicy = eutils.GetStringArg(args, "Script argument") 1886 args = args[1:] 1887 case "-mathml": 1888 if len(args) < 2 { 1889 fmt.Fprintf(os.Stderr, "\nERROR: -mathml argument is missing\n") 1890 os.Exit(1) 1891 } 1892 // mathmlPolicy = eutils.GetStringArg(args, "MathML argument") 1893 args = args[1:] 1894 1895 case "-flag", "-flags": 1896 if len(args) < 2 { 1897 fmt.Fprintf(os.Stderr, "\nERROR: -flags argument is missing\n") 1898 os.Exit(1) 1899 } 1900 flgs = eutils.GetStringArg(args, "Flags argument") 1901 args = args[1:] 1902 1903 // debugging flags 1904 case "-stats", "-stat": 1905 stts = true 1906 case "-timer": 1907 timr = true 1908 case "-profile": 1909 prfl = true 1910 1911 default: 1912 // if not any of the controls, set flag to break out of for loop 1913 inSwitch = false 1914 } 1915 1916 if !inSwitch { 1917 break 1918 } 1919 1920 // skip past argument 1921 args = args[1:] 1922 1923 if len(args) < 1 { 1924 break 1925 } 1926 } 1927 1928 // -flag allows script to set -strict or -mixed (or -stems, or -stops) from argument 1929 switch flgs { 1930 case "strict": 1931 doStrict = true 1932 case "mixed": 1933 doMixed = true 1934 case "stems", "stem": 1935 // ignore 1936 case "stops", "stop": 1937 // ignore 1938 case "none", "default": 1939 default: 1940 if flgs != "" { 1941 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized -flag value '%s'\n", flgs) 1942 os.Exit(1) 1943 } 1944 } 1945 1946 /* 1947 UnicodeFix = ParseMarkup(unicodePolicy, "-unicode") 1948 ScriptFix = ParseMarkup(scriptPolicy, "-script") 1949 MathMLFix = ParseMarkup(mathmlPolicy, "-mathml") 1950 1951 if UnicodeFix != NOMARKUP { 1952 doUnicode = true 1953 } 1954 1955 if ScriptFix != NOMARKUP { 1956 doScript = true 1957 } 1958 1959 if MathMLFix != NOMARKUP { 1960 doMathML = true 1961 } 1962 */ 1963 1964 if numProcs == 0 { 1965 if defProcs > 0 { 1966 numProcs = defProcs 1967 } else if maxProcs > 0 { 1968 numProcs = maxProcs 1969 } 1970 } 1971 if numProcs > ncpu { 1972 numProcs = ncpu 1973 } 1974 if numProcs > maxProcs { 1975 numProcs = maxProcs 1976 } 1977 1978 eutils.SetTunings(numProcs, numServe, serverRatio, chanDepth, farmSize, heapSize, goGc) 1979 1980 eutils.SetOptions(doStrict, doMixed, deAccent, doASCII, doCompress, doCleanup) 1981 1982 // -stats prints number of CPUs and performance tuning values if no other arguments (undocumented) 1983 if stts && len(args) < 1 { 1984 1985 eutils.PrintStats() 1986 1987 return 1988 } 1989 1990 if len(args) < 1 { 1991 fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to transmute\n") 1992 os.Exit(1) 1993 } 1994 1995 // DOCUMENTATION COMMANDS 1996 1997 inSwitch = true 1998 1999 switch args[0] { 2000 case "-version": 2001 fmt.Printf("%s\n", eutils.EDirectVersion) 2002 case "-help": 2003 fmt.Printf("transmute %s\n%s\n", eutils.EDirectVersion, transmuteHelp) 2004 case "-extra", "-extras": 2005 fmt.Printf("transmute %s\n%s\n", eutils.EDirectVersion, transmuteExtra) 2006 case "-degenerate": 2007 // generate new genetic code data tables (undocumented) 2008 eutils.GenerateGeneticCodeMaps() 2009 default: 2010 // if not any of the documentation commands, keep going 2011 inSwitch = false 2012 } 2013 2014 if inSwitch { 2015 return 2016 } 2017 2018 // FILE NAME CAN BE SUPPLIED WITH -input COMMAND 2019 2020 in := os.Stdin 2021 2022 // check for data being piped into stdin 2023 isPipe := false 2024 fi, err := os.Stdin.Stat() 2025 if err == nil { 2026 isPipe = bool((fi.Mode() & os.ModeNamedPipe) != 0) 2027 } 2028 2029 usingFile := false 2030 2031 if fileName != "" { 2032 2033 inFile, err := os.Open(fileName) 2034 if err != nil { 2035 fmt.Fprintf(os.Stderr, "\nERROR: Unable to open input file '%s'\n", fileName) 2036 os.Exit(1) 2037 } 2038 2039 defer inFile.Close() 2040 2041 // use indicated file instead of stdin 2042 in = inFile 2043 usingFile = true 2044 2045 if isPipe && runtime.GOOS != "windows" { 2046 mode := fi.Mode().String() 2047 fmt.Fprintf(os.Stderr, "\nERROR: Input data from both stdin and file '%s', mode is '%s'\n", fileName, mode) 2048 os.Exit(1) 2049 } 2050 } 2051 2052 // check for -input command after extraction arguments 2053 for _, str := range args { 2054 if str == "-input" { 2055 fmt.Fprintf(os.Stderr, "\nERROR: Misplaced -input command\n") 2056 os.Exit(1) 2057 } 2058 } 2059 2060 // START PROFILING IF REQUESTED 2061 2062 if prfl { 2063 2064 f, err := os.Create("cpu.pprof") 2065 if err != nil { 2066 fmt.Fprintf(os.Stderr, "\nERROR: Unable to create profile output file\n") 2067 os.Exit(1) 2068 } 2069 2070 pprof.StartCPUProfile(f) 2071 2072 defer pprof.StopCPUProfile() 2073 } 2074 2075 // INITIALIZE RECORD COUNT 2076 2077 recordCount := 0 2078 byteCount := 0 2079 2080 // print processing rate and program duration 2081 printDuration := func(name string) { 2082 2083 eutils.PrintDuration(name, recordCount, byteCount) 2084 } 2085 2086 nextArg := func() (string, bool) { 2087 2088 if len(args) < 1 { 2089 return "", false 2090 } 2091 2092 // remove next token from slice 2093 nxt := args[0] 2094 args = args[1:] 2095 2096 return nxt, true 2097 } 2098 2099 // The several converter functions that follow must be called 2100 // before CreateXMLStreamer starts draining stdin 2101 2102 // JSON TO XML CONVERTER 2103 2104 if args[0] == "-j2x" || args[0] == "-json2xml" { 2105 2106 // skip past command name 2107 args = args[1:] 2108 2109 set := "root" 2110 rec := "" 2111 nest := "" 2112 2113 // look for optional arguments 2114 for { 2115 arg, ok := nextArg() 2116 if !ok { 2117 break 2118 } 2119 2120 switch arg { 2121 case "-set": 2122 // override set wrapper 2123 set, ok = nextArg() 2124 if ok && set == "-" { 2125 set = "" 2126 } 2127 case "-rec": 2128 // override record wrapper 2129 rec, ok = nextArg() 2130 if ok && rec == "-" { 2131 rec = "" 2132 } 2133 case "-nest": 2134 // specify nested array naming policy 2135 nest, ok = nextArg() 2136 if !ok { 2137 fmt.Fprintf(os.Stderr, "Nested array naming policy is missing\n") 2138 os.Exit(1) 2139 } 2140 if ok && nest == "-" { 2141 nest = "flat" 2142 } 2143 switch nest { 2144 case "flat", "plural", "name", "recurse", "recursive", "same", "depth", "deep", "level": 2145 default: 2146 fmt.Fprintf(os.Stderr, "Unrecognized nested array naming policy\n") 2147 os.Exit(1) 2148 } 2149 default: 2150 // alternative form uses positional arguments to override set and rec 2151 set = arg 2152 if set == "-" { 2153 set = "" 2154 } 2155 rec, ok = nextArg() 2156 if ok && rec == "-" { 2157 rec = "" 2158 } 2159 } 2160 } 2161 2162 // use output channel of tokenizer as input channel of converter 2163 jcnv := eutils.JSONConverter(in, set, rec, nest) 2164 2165 if jcnv == nil { 2166 fmt.Fprintf(os.Stderr, "\nERROR: Unable to create JSON to XML converter\n") 2167 os.Exit(1) 2168 } 2169 2170 // drain output of channel 2171 for str := range jcnv { 2172 2173 if str == "" { 2174 continue 2175 } 2176 2177 // send result to output 2178 os.Stdout.WriteString(str) 2179 if !strings.HasSuffix(str, "\n") { 2180 os.Stdout.WriteString("\n") 2181 } 2182 2183 recordCount++ 2184 runtime.Gosched() 2185 } 2186 2187 debug.FreeOSMemory() 2188 2189 if timr { 2190 printDuration("blocks") 2191 } 2192 2193 return 2194 } 2195 2196 // ASN.1 TO XML CONVERTER 2197 2198 if args[0] == "-a2x" || args[0] == "-asn2xml" { 2199 2200 // skip past command name 2201 args = args[1:] 2202 2203 set := "" 2204 rec := "" 2205 2206 // look for optional arguments 2207 for { 2208 arg, ok := nextArg() 2209 if !ok { 2210 break 2211 } 2212 2213 switch arg { 2214 case "-set": 2215 // override set wrapper 2216 set, ok = nextArg() 2217 if ok && set == "-" { 2218 set = "" 2219 } 2220 case "-rec": 2221 // override record wrapper 2222 rec, ok = nextArg() 2223 if ok && rec == "-" { 2224 rec = "" 2225 } 2226 } 2227 } 2228 2229 acnv := eutils.ASN1Converter(in, set, rec) 2230 2231 if acnv == nil { 2232 fmt.Fprintf(os.Stderr, "\nERROR: Unable to create ASN.1 to XML converter\n") 2233 os.Exit(1) 2234 } 2235 2236 // drain output of channel 2237 for str := range acnv { 2238 2239 if str == "" { 2240 continue 2241 } 2242 2243 // send result to output 2244 os.Stdout.WriteString(str) 2245 if !strings.HasSuffix(str, "\n") { 2246 os.Stdout.WriteString("\n") 2247 } 2248 2249 recordCount++ 2250 runtime.Gosched() 2251 } 2252 2253 debug.FreeOSMemory() 2254 2255 if timr { 2256 printDuration("blocks") 2257 } 2258 2259 return 2260 } 2261 2262 // READ TAB-DELIMITED FILE AND WRAP IN XML FIELDS 2263 2264 doTable := func(delim string) { 2265 2266 // skip past command name 2267 args = args[1:] 2268 2269 set := "" 2270 rec := "" 2271 2272 skip := 0 2273 header := false 2274 lower := false 2275 upper := false 2276 indent := true 2277 2278 var fields []string 2279 numFlds := 0 2280 2281 for len(args) > 0 { 2282 str := args[0] 2283 switch str { 2284 case "-set": 2285 args = args[1:] 2286 if len(args) < 1 { 2287 fmt.Fprintf(os.Stderr, "\nERROR: No argument after -set\n") 2288 os.Exit(1) 2289 } 2290 set = args[0] 2291 args = args[1:] 2292 case "-rec": 2293 args = args[1:] 2294 if len(args) < 1 { 2295 fmt.Fprintf(os.Stderr, "\nERROR: No argument after -rec\n") 2296 os.Exit(1) 2297 } 2298 rec = args[0] 2299 args = args[1:] 2300 case "-skip": 2301 args = args[1:] 2302 if len(args) < 1 { 2303 fmt.Fprintf(os.Stderr, "\nERROR: No argument after -skip\n") 2304 os.Exit(1) 2305 } 2306 tmp := args[0] 2307 val, err := strconv.Atoi(tmp) 2308 if err != nil { 2309 fmt.Fprintf(os.Stderr, "\nERROR: -skip argument (%s) is not an integer\n", tmp) 2310 os.Exit(1) 2311 } 2312 skip = val 2313 args = args[1:] 2314 case "-header", "-headers", "-heading": 2315 header = true 2316 args = args[1:] 2317 case "-lower": 2318 lower = true 2319 args = args[1:] 2320 case "-upper": 2321 upper = true 2322 args = args[1:] 2323 case "-indent": 2324 indent = true 2325 args = args[1:] 2326 case "-flush": 2327 indent = false 2328 args = args[1:] 2329 default: 2330 // remaining arguments are names for columns 2331 if str != "" && str != "*" { 2332 fields = append(fields, str) 2333 numFlds++ 2334 } 2335 args = args[1:] 2336 } 2337 } 2338 2339 if numFlds < 1 && !header { 2340 fmt.Fprintf(os.Stderr, "\nERROR: Insufficient arguments for table converter\n") 2341 os.Exit(1) 2342 } 2343 2344 tble := eutils.TableConverter(in, delim, set, rec, skip, header, lower, upper, indent, fields) 2345 2346 if tble == nil { 2347 fmt.Fprintf(os.Stderr, "\nERROR: Unable to create table to XML converter\n") 2348 os.Exit(1) 2349 } 2350 2351 // drain output of channel 2352 for str := range tble { 2353 2354 if str == "" { 2355 continue 2356 } 2357 2358 // send result to output 2359 os.Stdout.WriteString(str) 2360 if !strings.HasSuffix(str, "\n") { 2361 os.Stdout.WriteString("\n") 2362 } 2363 2364 recordCount++ 2365 runtime.Gosched() 2366 } 2367 2368 debug.FreeOSMemory() 2369 2370 if timr { 2371 printDuration("lines") 2372 } 2373 } 2374 2375 if len(args) > 1 && args[0] == "-t2x" { 2376 2377 doTable("\t") 2378 return 2379 } 2380 2381 if len(args) > 1 && args[0] == "-c2x" { 2382 2383 doTable(",") 2384 return 2385 } 2386 2387 // READ GENBANK FLATFILE AND TRANSLATE TO INSDSEQ XML 2388 2389 if len(args) > 0 && args[0] == "-g2x" { 2390 2391 gbk := eutils.GenBankConverter(in) 2392 2393 if gbk == nil { 2394 fmt.Fprintf(os.Stderr, "Unable to create GenBank to XML converter\n") 2395 os.Exit(1) 2396 } 2397 2398 head := `<?xml version="1.0" encoding="UTF-8" ?> 2399<!DOCTYPE INSDSet PUBLIC "-//NCBI//INSD INSDSeq/EN" "https://www.ncbi.nlm.nih.gov/dtd/INSD_INSDSeq.dtd"> 2400<INSDSet> 2401` 2402 tail := "" 2403 2404 // drain output of last channel in service chain 2405 for str := range gbk { 2406 2407 if str == "" { 2408 continue 2409 } 2410 2411 if head != "" { 2412 os.Stdout.WriteString(head) 2413 head = "" 2414 tail = `</INSDSet> 2415` 2416 } 2417 2418 // send result to stdout 2419 os.Stdout.WriteString(str) 2420 if !strings.HasSuffix(str, "\n") { 2421 os.Stdout.WriteString("\n") 2422 } 2423 2424 recordCount++ 2425 2426 runtime.Gosched() 2427 } 2428 2429 if tail != "" { 2430 os.Stdout.WriteString(tail) 2431 } 2432 2433 debug.FreeOSMemory() 2434 2435 if timr { 2436 printDuration("records") 2437 } 2438 2439 return 2440 } 2441 2442 // STRING CONVERSION COMMANDS 2443 2444 inSwitch = true 2445 2446 switch args[0] { 2447 case "-encodeURL": 2448 encodeURL(in) 2449 case "-decodeURL": 2450 decodeURL(in) 2451 case "-encode64", "-encodeB64", "-encodeBase64": 2452 encodeB64(in) 2453 case "-decode64", "-decodeB64", "-decodeBase64": 2454 decodeB64(in) 2455 case "-hgvs": 2456 decodeHGVS(in) 2457 case "-align": 2458 processAlign(in, args) 2459 case "-remove": 2460 sequenceRemove(in, args) 2461 case "-retain": 2462 sequenceRetain(in, args) 2463 case "-replace": 2464 sequenceReplace(in, args) 2465 case "-extract": 2466 sequenceExtract(in, args) 2467 case "-revcomp": 2468 nucRevComp(in) 2469 case "-reverse": 2470 seqFlip(in) 2471 case "-molwt": 2472 protWeight(in, args) 2473 case "-cds2prot": 2474 cdRegionToProtein(in, args) 2475 case "-diff": 2476 fastaDiff(in, args) 2477 default: 2478 // if not any of the conversion commands, keep going 2479 inSwitch = false 2480 } 2481 2482 if inSwitch { 2483 2484 debug.FreeOSMemory() 2485 2486 return 2487 } 2488 2489 // CREATE XML BLOCK READER FROM STDIN OR FILE 2490 2491 rdr := eutils.CreateXMLStreamer(in) 2492 if rdr == nil { 2493 fmt.Fprintf(os.Stderr, "\nERROR: Unable to create XML Block Reader\n") 2494 os.Exit(1) 2495 } 2496 2497 // CONFIRM INPUT DATA AVAILABILITY AFTER RUNNING COMMAND GENERATORS 2498 2499 if fileName == "" && runtime.GOOS != "windows" { 2500 2501 fromStdin := bool((fi.Mode() & os.ModeCharDevice) == 0) 2502 if !isPipe || !fromStdin { 2503 mode := fi.Mode().String() 2504 fmt.Fprintf(os.Stderr, "\nERROR: No data supplied to transmute from stdin or file, mode is '%s'\n", mode) 2505 os.Exit(1) 2506 } 2507 } 2508 2509 if !usingFile && !isPipe { 2510 2511 fmt.Fprintf(os.Stderr, "\nERROR: No XML input data supplied to transmute\n") 2512 os.Exit(1) 2513 } 2514 2515 // SPECIAL FORMATTING COMMANDS 2516 2517 inSwitch = true 2518 leaf := false 2519 2520 switch args[0] { 2521 case "-format": 2522 processFormat(rdr, args) 2523 case "-filter": 2524 processFilter(rdr, args) 2525 case "-normalize", "-normal": 2526 if len(args) < 2 { 2527 fmt.Fprintf(os.Stderr, "\nERROR: No database supplied to -normalize\n") 2528 os.Exit(1) 2529 } 2530 db := args[1] 2531 nrm := eutils.NormalizeXML(rdr, db) 2532 eutils.ChanToStdout(nrm) 2533 case "-outline": 2534 processOutline(rdr) 2535 case "-contour": 2536 leaf = true 2537 fallthrough 2538 case "-synopsis": 2539 args = args[1:] 2540 delim := "/" 2541 if len(args) > 0 { 2542 delim = args[0] 2543 if len(delim) > 3 { 2544 delim = "/" 2545 } 2546 } 2547 processSynopsis(rdr, leaf, delim) 2548 case "-tokens": 2549 processTokens(rdr) 2550 default: 2551 // if not any of the formatting commands, keep going 2552 inSwitch = false 2553 } 2554 2555 if inSwitch { 2556 2557 debug.FreeOSMemory() 2558 2559 // suppress printing of lines if not properly counted 2560 if recordCount == 1 { 2561 recordCount = 0 2562 } 2563 2564 if timr { 2565 printDuration("lines") 2566 } 2567 2568 return 2569 } 2570 2571 // SPECIFY STRINGS TO GO BEFORE AND AFTER ENTIRE OUTPUT OR EACH RECORD 2572 2573 head := "" 2574 tail := "" 2575 2576 hd := "" 2577 tl := "" 2578 2579 for { 2580 2581 inSwitch = true 2582 2583 switch args[0] { 2584 case "-head": 2585 if len(args) < 2 { 2586 fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -head command\n") 2587 os.Exit(1) 2588 } 2589 head = eutils.ConvertSlash(args[1]) 2590 case "-tail": 2591 if len(args) < 2 { 2592 fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -tail command\n") 2593 os.Exit(1) 2594 } 2595 tail = eutils.ConvertSlash(args[1]) 2596 case "-hd": 2597 if len(args) < 2 { 2598 fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -hd command\n") 2599 os.Exit(1) 2600 } 2601 hd = eutils.ConvertSlash(args[1]) 2602 case "-tl": 2603 if len(args) < 2 { 2604 fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -tl command\n") 2605 os.Exit(1) 2606 } 2607 tl = eutils.ConvertSlash(args[1]) 2608 case "-wrp": 2609 // shortcut to wrap records in XML tags 2610 if len(args) < 2 { 2611 fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -wrp command\n") 2612 os.Exit(1) 2613 } 2614 tmp := eutils.ConvertSlash(args[1]) 2615 lft, rgt := eutils.SplitInTwoLeft(tmp, ",") 2616 if lft != "" { 2617 head = "<" + lft + ">" 2618 tail = "</" + lft + ">" 2619 } 2620 if rgt != "" { 2621 hd = "<" + rgt + ">" 2622 tl = "</" + rgt + ">" 2623 } 2624 case "-set": 2625 if len(args) < 2 { 2626 fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -set command\n") 2627 os.Exit(1) 2628 } 2629 tmp := eutils.ConvertSlash(args[1]) 2630 if tmp != "" { 2631 head = "<" + tmp + ">" 2632 tail = "</" + tmp + ">" 2633 } 2634 case "-rec": 2635 if len(args) < 2 { 2636 fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -rec command\n") 2637 os.Exit(1) 2638 } 2639 tmp := eutils.ConvertSlash(args[1]) 2640 if tmp != "" { 2641 hd = "<" + tmp + ">" 2642 tl = "</" + tmp + ">" 2643 } 2644 default: 2645 // if not any of the controls, set flag to break out of for loop 2646 inSwitch = false 2647 } 2648 2649 if !inSwitch { 2650 break 2651 } 2652 2653 // skip past arguments 2654 args = args[2:] 2655 2656 if len(args) < 1 { 2657 fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to transmute\n") 2658 os.Exit(1) 2659 } 2660 } 2661 2662 // ENSURE PRESENCE OF PATTERN ARGUMENT 2663 2664 if len(args) < 1 { 2665 fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to transmute\n") 2666 os.Exit(1) 2667 } 2668 2669 // allow -record as synonym of -pattern (undocumented) 2670 if args[0] == "-record" || args[0] == "-Record" { 2671 args[0] = "-pattern" 2672 } 2673 2674 // make sure top-level -pattern command is next 2675 if args[0] != "-pattern" && args[0] != "-Pattern" { 2676 fmt.Fprintf(os.Stderr, "\nERROR: No -pattern in command-line arguments\n") 2677 os.Exit(1) 2678 } 2679 if len(args) < 2 { 2680 fmt.Fprintf(os.Stderr, "\nERROR: Item missing after -pattern command\n") 2681 os.Exit(1) 2682 } 2683 2684 topPat := args[1] 2685 if topPat == "" { 2686 fmt.Fprintf(os.Stderr, "\nERROR: Item missing after -pattern command\n") 2687 os.Exit(1) 2688 } 2689 if strings.HasPrefix(topPat, "-") { 2690 fmt.Fprintf(os.Stderr, "\nERROR: Misplaced %s command\n", topPat) 2691 os.Exit(1) 2692 } 2693 2694 // look for -pattern Parent/* construct for heterogeneous data, e.g., -pattern PubmedArticleSet/* 2695 topPattern, star := eutils.SplitInTwoLeft(topPat, "/") 2696 if topPattern == "" { 2697 return 2698 } 2699 2700 // CONCURRENT REFORMATTING OF PARSED XML RECORDS 2701 2702 // -pattern plus -format does concurrent flush-left reformatting 2703 if len(args) > 2 && args[2] == "-format" { 2704 2705 format := "flush" 2706 if len(args) > 3 { 2707 format = args[3] 2708 if strings.HasPrefix(format, "-") { 2709 format = "flush" 2710 } 2711 } 2712 2713 xmlq := eutils.CreateXMLProducer(topPattern, star, rdr) 2714 fchq := createFormatters(topPattern, format, xmlq) 2715 unsq := eutils.CreateXMLUnshuffler(fchq) 2716 2717 if xmlq == nil || fchq == nil || unsq == nil { 2718 fmt.Fprintf(os.Stderr, "\nERROR: Unable to create formatter\n") 2719 os.Exit(1) 2720 } 2721 2722 if head != "" { 2723 os.Stdout.WriteString(head) 2724 os.Stdout.WriteString("\n") 2725 } 2726 2727 // drain output channel 2728 for curr := range unsq { 2729 2730 str := curr.Text 2731 2732 if str == "" { 2733 continue 2734 } 2735 2736 if hd != "" { 2737 os.Stdout.WriteString(hd) 2738 os.Stdout.WriteString("\n") 2739 } 2740 2741 // send result to output 2742 os.Stdout.WriteString(str) 2743 if !strings.HasSuffix(str, "\n") { 2744 os.Stdout.WriteString("\n") 2745 } 2746 2747 if tl != "" { 2748 os.Stdout.WriteString(tl) 2749 os.Stdout.WriteString("\n") 2750 } 2751 2752 recordCount++ 2753 runtime.Gosched() 2754 } 2755 2756 if tail != "" { 2757 os.Stdout.WriteString(tail) 2758 os.Stdout.WriteString("\n") 2759 } 2760 2761 debug.FreeOSMemory() 2762 2763 if timr { 2764 printDuration("records") 2765 } 2766 2767 return 2768 } 2769 2770 // REPORT UNRECOGNIZED COMMAND 2771 2772 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized transmute command\n") 2773 os.Exit(1) 2774} 2775