1// =========================================================================== 2// 3// PUBLIC DOMAIN NOTICE 4// National Center for Biotechnology Information (NCBI) 5// 6// This software/database is a "United States Government Work" under the 7// terms of the United States Copyright Act. It was written as part of 8// the author's official duties as a United States Government employee and 9// thus cannot be copyrighted. This software/database is freely available 10// to the public for use. The National Library of Medicine and the U.S. 11// Government do not place any restriction on its use or reproduction. 12// We would, however, appreciate having the NCBI and the author cited in 13// any work or product based on this material. 14// 15// Although all reasonable efforts have been taken to ensure the accuracy 16// and reliability of the software and data, the NLM and the U.S. 17// Government do not and cannot warrant the performance or results that 18// may be obtained by using this software or data. The NLM and the U.S. 19// Government disclaim all warranties, express or implied, including 20// warranties of performance, merchantability or fitness for any particular 21// purpose. 22// 23// =========================================================================== 24// 25// File Name: xtract.go 26// 27// Author: Jonathan Kans 28// 29// ========================================================================== 30 31package main 32 33import ( 34 "bufio" 35 "encoding/base64" 36 "encoding/hex" 37 "eutils" 38 "fmt" 39 "github.com/fatih/color" 40 "github.com/surgebase/porter2" 41 "html" 42 "math" 43 "net/url" 44 "os" 45 "path" 46 "path/filepath" 47 "regexp" 48 "runtime" 49 "runtime/debug" 50 "runtime/pprof" 51 "sort" 52 "strconv" 53 "strings" 54 "sync" 55 "time" 56 "unicode" 57) 58 59// XTRACT HELP MESSAGE TEXT 60 61const xtractHelp = ` 62Overview 63 64 Xtract uses command-line arguments to convert XML data into a tab-delimited table. 
65 66 -pattern places the data from individual records into separate rows. 67 68 -element extracts values from specified fields into separate columns. 69 70 -group, -block, and -subset limit element exploration to selected XML subregions. 71 72Processing Flags 73 74 -strict Remove HTML and MathML tags 75 -mixed Allow mixed content XML 76 77 -accent Excise Unicode accents and diacritical marks 78 -ascii Unicode to numeric HTML character entities 79 -compress Compress runs of spaces 80 81 -stops Retain stop words in selected phrases 82 83Data Source 84 85 -input Read XML from file instead of stdin 86 -transform File of substitutions for -translate 87 88Exploration Argument Hierarchy 89 90 -pattern Name of record within set 91 -group Use of different argument 92 -block names allows command-line 93 -subset control of nested looping 94 95Path Navigation 96 97 -path Explore by list of adjacent object names 98 99Exploration Constructs 100 101 Object DateRevised 102 Parent/Child Book/AuthorList 103 Path MedlineCitation/Article/Journal/JournalIssue/PubDate 104 Heterogeneous "PubmedArticleSet/*" 105 Exhaustive "History/**" 106 Nested "*/Taxon" 107 Recursive "**/Gene-commentary" 108 109Conditional Execution 110 111 -if Element [@attribute] required 112 -unless Skip if element matches 113 -and All tests must pass 114 -or Any passing test suffices 115 -else Execute if conditional test failed 116 -position [first|last|outer|inner|even|odd|all] 117 118String Constraints 119 120 -equals String must match exactly 121 -contains Substring must be present 122 -is-within String must be present 123 -starts-with Substring must be at beginning 124 -ends-with Substring must be at end 125 -is-not String must not match 126 -is-before First string < second string 127 -is-after First string > second string 128 -matches Matches without commas or semicolons 129 -resembles Requires all words, but in any order 130 131Object Constraints 132 133 -is-equal-to Object values must match 134 
-differs-from Object values must differ 135 136Numeric Constraints 137 138 -gt Greater than 139 -ge Greater than or equal to 140 -lt Less than 141 -le Less than or equal to 142 -eq Equal to 143 -ne Not equal to 144 145Format Customization 146 147 -ret Override line break between patterns 148 -tab Replace tab character between fields 149 -sep Separator between group members 150 -pfx Prefix to print before group 151 -sfx Suffix to print after group 152 -rst Reset -sep through -elg 153 -clr Clear queued tab separator 154 -pfc Preface combines -clr and -pfx 155 -deq Delete and replace queued tab separator 156 -def Default placeholder for missing fields 157 -lbl Insert arbitrary text 158 159XML Generation 160 161 -set XML tag for entire set 162 -rec XML tag for each record 163 164 -wrp Wrap elements in XML object 165 166 -enc Encase instance in XML object 167 -plg Prologue to print before instance 168 -elg Epilogue to print after instance 169 170 -pkg Package subset in XML object 171 -fwd Foreword to print before subset 172 -awd Afterword to print after subset 173 174Element Selection 175 176 -element Print all items that match tag name 177 -first Only print value of first item 178 -last Only print value of last item 179 -NAME Record value in named variable 180 --STATS Accumulate values into variable 181 182-element Constructs 183 184 Tag Caption 185 Group Initials,LastName 186 Parent/Child MedlineCitation/PMID 187 Recursive "**/Gene-commentary_accession" 188 Unrestricted "PubDate/*" 189 Attribute DescriptorName@MajorTopicYN 190 Range MedlineDate[1:4] 191 Substring "Title[phospholipase | rattlesnake]" 192 Object Count "#Author" 193 Item Length "%Title" 194 Element Depth "^PMID" 195 Variable "&NAME" 196 197Special -element Operations 198 199 Parent Index "+" 200 Object Name "?" 
201 XML Subtree "*" 202 Children "$" 203 Attributes "@" 204 205Numeric Processing 206 207 -num Count 208 -len Length 209 -sum Sum 210 -min Minimum 211 -max Maximum 212 -inc Increment 213 -dec Decrement 214 -sub Difference 215 -avg Average 216 -dev Deviation 217 -med Median 218 -mul Product 219 -div Quotient 220 -mod Remainder 221 -bin Binary 222 -bit Bit Count 223 224String Processing 225 226 -encode XML-encode <, >, &, ", and ' characters 227 -plain Remove embedded mixed-content markup tags 228 -upper Convert text to upper-case 229 -lower Convert text to lower-case 230 -chain Change_spaces_to_underscores 231 -title Capitalize initial letters of words 232 -year Extract first 4-digit year from string 233 -doi Add https://doi.org/ prefix, URL encode 234 -translate Substitute values with -transform table 235 236Text Processing 237 238 -terms Partition text at spaces 239 -words Split at punctuation marks 240 -pairs Adjacent informative words 241 -order Rearrange words in sorted order 242 -reverse Reverse words in string 243 -letters Separate individual letters 244 -clauses Break at phrase separators 245 246Regular Expression 247 248 -replace Substitute text using regular expressions 249 250 -reg Target expression 251 -exp Replacement pattern 252 253Sequence Processing 254 255 -revcomp Reverse complement nucleotide sequence 256 -nucleic Subrange determines forward or revcomp 257 -fasta Split sequence into blocks of 50 letters 258 -ncbi2na Expand ncbi2na to iupac 259 -ncbi4na Expand ncbi4na to iupac 260 (May need to truncate result to actual sequence length) 261 -molwt Calculate molecular weight of peptide 262 263Sequence Coordinates 264 265 -0-based Zero-Based 266 -1-based One-Based 267 -ucsc-based Half-Open 268 269Command Generator 270 271 -insd Generate INSDSeq extraction commands 272 273-insd Argument Order 274 275 Descriptors INSDSeq_sequence INSDSeq_definition INSDSeq_division 276 Flags [complete|partial] 277 Feature(s) CDS,mRNA 278 Qualifiers INSDFeature_key 
"#INSDInterval" gene product feat_location sub_sequence 279 280Variation Processing 281 282 -hgvs Convert sequence variation format to XML 283 284Frequency Table 285 286 -histogram Collects data for sort-uniq-count on entire set of records 287 288Entrez Indexing 289 290 -e2index Create Entrez index XML 291 -indices Index normalized words 292 293Output Organization 294 295 -head Print before everything else 296 -tail Print after everything else 297 -hd Print before each record 298 -tl Print after each record 299 300Record Selection 301 302 -select Select record subset by conditions 303 -in File of identifiers to use for selection 304 305Record Rearrangement 306 307 -sort Element to use as sort key 308 309Reformatting 310 311 -format [copy|compact|flush|indent|expand] 312 313Validation 314 315 -verify Report XML data integrity problems 316 317Summary 318 319 -outline Display outline of XML structure 320 -synopsis Display individual XML paths 321 -contour Display XML paths to leaf nodes 322 [delimiter] 323 324Documentation 325 326 -help Print this document 327 -examples Examples of EDirect and xtract usage 328 -unix Common Unix command arguments 329 -version Print version number 330 331Notes 332 333 String constraints use case-insensitive comparisons. 334 335 Numeric constraints and selection arguments use integer values. 336 337 -num and -len selections are synonyms for Object Count (#) and Item Length (%). 338 339 -words, -pairs, -reverse, and -indices convert to lower case. 340 341 See transmute -help for data conversion and modification functions. 
342 343Xtract Examples 344 345 -pattern DocumentSummary -element Id -first Name Title 346 347 -pattern "PubmedArticleSet/*" -block Author -sep " " -element Initials,LastName 348 349 -pattern PubmedArticle -block MeshHeading -if "@MajorTopicYN" -equals Y -sep " / " -element DescriptorName,QualifierName 350 351 -pattern GenomicInfoType -element ChrAccVer ChrStart ChrStop 352 353 -pattern Taxon -block "*/Taxon" -unless Rank -equals "no rank" -tab "\n" -element Rank,ScientificName 354 355 -pattern Entrezgene -block "**/Gene-commentary" 356 357 -block INSDReference -position 2 358 359 -subset INSDInterval -position last -POS INSDInterval_to -element "&SEQ[&POS+1:]" 360 361 -if Author -and Title 362 363 -if "#Author" -lt 6 -and "%Title" -le 70 364 365 -if DateRevised/Year -gt 2005 366 367 -if ChrStop -lt ChrStart 368 369 -if CommonName -contains mouse 370 371 -if "&ABST" -starts-with "Transposable elements" 372 373 -if MapLocation -element MapLocation -else -lbl "\-" 374 375 -if inserted_sequence -differs-from deleted_sequence 376 377 -min ChrStart,ChrStop 378 379 -max ExonCount 380 381 -inc position -element inserted_sequence 382 383 -1-based ChrStart 384 385 -insd CDS gene product protein_id translation 386 387 -insd complete mat_peptide "%peptide" product peptide 388 389 -insd CDS INSDInterval_iscomp@value INSDInterval_from INSDInterval_to 390 391 -pattern PubmedArticle -select PubDate/Year -eq 2015 392 393 -pattern PubmedArticle -select MedlineCitation/PMID -in file_of_pmids.txt 394 395 -wrp PubmedArticleSet -pattern PubmedArticle -sort MedlineCitation/PMID 396 397 -pattern PubmedArticle -split 5000 -prefix "subset" -suffix "xml" 398 399 -pattern PubmedBookArticle -path BookDocument.Book.AuthorList.Author -element LastName 400 401 -pattern PubmedArticle -group MedlineCitation/Article/Journal/JournalIssue/PubDate -year "PubDate/*" 402 403 -mixed -verify MedlineCitation/PMID -html 404 405Transmute Examples 406 407 transmute -j2x -set - -rec GeneRec 408 409 transmute 
-t2x -set Set -rec Rec -skip 1 Code Name 410 411 transmute -filter ExpXml decode content 412 413 transmute -filter LocationHist remove object 414 415 transmute -normalize pubmed 416 417 transmute -head "<PubmedArticleSet>" -tail "</PubmedArticleSet>" -pattern "PubmedArticleSet/*" -format 418` 419 420const xtractInternal = ` 421Performance Default Overrides 422 423 -proc Number of CPU processors used 424 -cons Ratio of parsers to processors 425 -serv Concurrent parser instances 426 -chan Communication channel depth 427 -heap Order restoration heap size 428 -farm Node allocation buffer length 429 -gogc Garbage collection tuning knob 430 431Internal Component Performance 432 433 -chunk StreamBlocks 434 -split StreamBlocks -> SplitPattern 435 -token StreamBlocks -> StreamTokens 436 437Debugging 438 439 -debug Display run-time parameter summary 440 -empty Flag records with no output 441 -ident Print record index numbers 442 -stats Show processing time for each record 443 -timer Report processing duration and rate 444 -trial Optimize -proc value, requires -input 445 446Documentation 447 448 -keys Keyboard navigation shortcuts 449 -unix Common Unix commands 450 451Performance Tuning Script 452 453 XtractTrials() { 454 echo -e "<Trials>" 455 for tries in {1..5} 456 do 457 xtract -debug -input "$1" -proc "$2" -pattern PubmedArticle -element LastName 458 done 459 echo -e "</Trials>" 460 } 461 462 for proc in {1..8} 463 do 464 XtractTrials "carotene.xml" "$proc" | 465 xtract -pattern Trials -lbl "$proc" -avg Rate -dev Rate 466 done 467 468Processor Titration Results 469 470 1 27622 31 471 2 51799 312 472 3 74853 593 473 4 95867 1337 474 5 97171 4019 475 6 93460 2458 476 7 87467 1030 477 8 82448 2651 478 479Entrez Index Performance Measurement 480 481 IndexTrials() { 482 echo -e "<Trials>" 483 for tries in {1..5} 484 do 485 cat "$1" | xtract -debug -proc "$2" -e2index 486 done 487 echo -e "</Trials>" 488 } 489 490 for proc in {1..8} 491 do 492 IndexTrials "carotene.xml" 
"$proc" | 493 xtract -pattern Trials -lbl "$proc" -avg Rate -dev Rate 494 done 495 496MeSH Tree Index Preparation 497 498 ftp-cp nlmpubs.nlm.nih.gov online/mesh/MESH_FILES/xmlmesh desc2021.zip 499 unzip desc2021.zip 500 rm desc2021.zip 501 502 cat desc2021.xml | 503 xtract -pattern DescriptorRecord -element "DescriptorRecord/DescriptorUI" \ 504 -sep "," -element TreeNumber > meshtree.txt 505 506Execution Profiling 507 508 cat carotene.xml > /dev/null 509 ./xtract -profile -timer -input carotene.xml -pattern PubmedArticle -element LastName > /dev/null 510 go tool pprof --pdf ./cpu.pprof > ~/Desktop/callgraph.pdf 511 rm cpu.pprof 512` 513 514const keyboardShortcuts = ` 515Command History 516 517 Ctrl-n Next command 518 Ctrl-p Previous command 519 520Move Cursor Forward 521 522 Ctrl-e To end of line 523 Ctrl-f By one character 524 Esc-f By one argument 525 526Move Cursor Backward 527 528 Ctrl-a To beginning of line 529 Ctrl-b By one character 530 Esc-b By one argument 531 532Delete 533 534 Del Previous character 535 Ctrl-d Next character 536 Ctrl-k To end of line 537 Ctrl-u Entire line 538 Ctrl-w Previous word 539 Esc-Del Previous argument 540 Esc-d Next argument 541 542Autocomplete 543 544 Tab Completes directory or file names 545 546Program Control 547 548 Ctrl-c Quit running program 549 ^x^y Run last command replacing x with y 550 Ctrl-z Suspend foreground job 551 kill %% Quit suspended script 552` 553 554const unixCommands = ` 555Process by Contents 556 557 sort Sorts lines of text 558 559 -f Ignore case 560 -n Numeric comparison 561 -r Reverse result order 562 563 -k Field key (start,stop or first) 564 -u Unique lines with identical keys 565 566 -b Ignore leading blanks 567 -s Stable sort 568 -t Specify field separator 569 570 uniq Removes repeated lines 571 572 -c Count occurrences 573 -i Ignore case 574 575 -f Ignore first n fields 576 -s Ignore first n characters 577 578 -d Only output repeated lines 579 -u Only output non-repeated lines 580 581 grep Matches 
patterns using regular expressions 582 583 -i Ignore case 584 -v Invert search 585 -w Search expression as a word 586 -x Search expression as whole line 587 588 -e Specify individual pattern 589 590 -c Only count number of matches 591 -n Print line numbers 592 -A Number of lines after match 593 -B Number of lines before match 594 595Regular Expressions 596 597 Characters 598 599 . Any single character (except newline) 600 \w Alphabetic [A-Za-z], numeric [0-9], or underscore (_) 601 \s Whitespace (space or tab) 602 \ Escapes special characters 603 [] Matches any enclosed characters 604 605 Positions 606 607 ^ Beginning of line 608 $ End of line 609 \b Word boundary 610 611 Repeat Matches 612 613 ? 0 or 1 614 * 0 or more 615 + 1 or more 616 {n} Exactly n 617 618 Escape Sequences 619 620 \n Line break 621 \t Tab character 622 623Modify Contents 624 625 sed Replaces text strings 626 627 -e Specify individual expression 628 s/// Substitute 629 /g Global 630 /I Case-insensitive 631 /p Print 632 633 tr Translates characters 634 635 -d Delete character 636 -s Squeeze runs of characters 637 638 rev Reverses characters on line 639 640Format Contents 641 642 column Aligns columns by content width 643 644 -s Specify field separator 645 -t Create table 646 647 expand Aligns columns to specified positions 648 649 -t Tab positions 650 651 fold Wraps lines at a specific width 652 653 -w Line width 654 -s Fold at spaces 655 656Filter by Position 657 658 cut Removes parts of lines 659 660 -c Characters to keep 661 -f Fields to keep 662 -d Specify field separator 663 -s Suppress lines with no delimiters 664 665 head Prints first lines 666 667 -n Number of lines 668 669 tail Prints last lines 670 671 -n Number of lines 672 673Miscellaneous 674 675 wc Counts words, lines, or characters 676 677 -c Characters 678 -l Lines 679 -w Words 680 681 xargs Constructs arguments 682 683 -n Number of words per batch 684 685 mktemp Make temporary file 686 687File Compression 688 689 tar Archive 
files 690 691 -c Create archive 692 -f Name of output file 693 -z Compress archive with gzip 694 695 gzip Compress file 696 697 -k Keep original file 698 -9 Best compression 699 700 unzip Decompress .zip archive 701 702 -p Pipe to stdout 703 704 gzcat Decompress .gz archive and pipe to stdout 705 706Directory and File Navigation 707 708 cd Changes directory 709 710 / Root 711 ~ Home 712 . Current 713 .. Parent 714 - Previous 715 716 ls Lists file names 717 718 -1 One entry per line 719 -a Show files beginning with dot (.) 720 -l List in long format 721 -R Recursively explore subdirectories 722 -S Sort files by size 723 -t Sort by most recently modified 724 .* Current and parent directory 725 726 pwd Prints working directory path 727 728File Redirection 729 730 < Read stdin from file 731 > Redirect stdout to file 732 >> Append to file 733 2> Redirect stderr 734 2>&1 Merge stderr into stdout 735 | Pipe between programs 736 <(cmd) Execute command, read results as file 737 738Shell Script Variables 739 740 $0 Name of script 741 $n Nth argument 742 $# Number of arguments 743 "$*" Argument list as one argument 744 "$@" Argument list as separate arguments 745 $? 
Exit status of previous command 746 747Shell Script Tests 748 749 -d Directory exists 750 -f File exists 751 -s File is not empty 752 -n Length of string is non-zero 753 -x File is executable 754 -z Variable is empty or not set 755 756File and Directory Extraction 757 758 BAS=$(printf pubmed%03d $n) 759 DIR=$(dirname "$0") 760 FIL=$(basename "$0") 761 762Remove Prefix 763 764 FILE="example.tar.gz" 765 # ${FILE#.*} -> tar.gz 766 ## ${FILE##.*} -> gz 767 768Remove Suffix 769 770 FILE="example.tar.gz" 771 TYPE="http://identifiers.org/uniprot_enzymes/" 772 % ${FILE%.*} -> example.tar 773 ${TYPE%/} -> http://identifiers.org/uniprot_enzymes 774 %% ${FILE%%.*} -> example 775` 776 777// GLOBAL VARIABLES 778 779var ( 780 doStem bool 781 deStop bool 782) 783 784// TYPED CONSTANTS 785 786// LevelType is the integer type for exploration arguments 787type LevelType int 788 789// LevelType keys for exploration arguments 790const ( 791 _ LevelType = iota 792 UNIT 793 SUBSET 794 SECTION 795 BLOCK 796 BRANCH 797 GROUP 798 DIVISION 799 PATH 800 PATTERN 801) 802 803// IndentType is the integer type for XML formatting 804type IndentType int 805 806// IndentType keys for XML formatting 807const ( 808 SINGULARITY IndentType = iota 809 COMPACT 810 FLUSH 811 INDENT 812 SUBTREE 813 WRAPPED 814) 815 816// OpType is the integer type for operations 817type OpType int 818 819// OpType keys for operations 820const ( 821 UNSET OpType = iota 822 ELEMENT 823 FIRST 824 LAST 825 ENCODE 826 DECODE 827 PLAIN 828 UPPER 829 LOWER 830 CHAIN 831 TITLE 832 ORDER 833 YEAR 834 DOI 835 TRANSLATE 836 REPLACE 837 TERMS 838 WORDS 839 PAIRS 840 REVERSE 841 LETTERS 842 CLAUSES 843 INDICES 844 MESHCODE 845 MATRIX 846 HISTOGRAM 847 ACCENTED 848 PFX 849 SFX 850 SEP 851 TAB 852 RET 853 LBL 854 CLR 855 PFC 856 DEQ 857 PLG 858 ELG 859 FWD 860 AWD 861 WRP 862 ENC 863 PKG 864 RST 865 DEF 866 REG 867 EXP 868 COLOR 869 POSITION 870 SELECT 871 IF 872 UNLESS 873 MATCH 874 AVOID 875 AND 876 OR 877 EQUALS 878 CONTAINS 879 
ISWITHIN 880 STARTSWITH 881 ENDSWITH 882 ISNOT 883 ISBEFORE 884 ISAFTER 885 MATCHES 886 RESEMBLES 887 ISEQUALTO 888 DIFFERSFROM 889 GT 890 GE 891 LT 892 LE 893 EQ 894 NE 895 NUM 896 LEN 897 SUM 898 MIN 899 MAX 900 INC 901 DEC 902 SUB 903 AVG 904 DEV 905 MED 906 MUL 907 DIV 908 MOD 909 BIN 910 BIT 911 ZEROBASED 912 ONEBASED 913 UCSCBASED 914 REVCOMP 915 NUCLEIC 916 FASTA 917 NCBI2NA 918 NCBI4NA 919 MOLWT 920 HGVS 921 ELSE 922 VARIABLE 923 ACCUMULATOR 924 VALUE 925 QUESTION 926 STAR 927 DOLLAR 928 ATSIGN 929 COUNT 930 LENGTH 931 DEPTH 932 INDEX 933 UNRECOGNIZED 934) 935 936// ArgumentType is the integer type for argument classification 937type ArgumentType int 938 939// ArgumentType keys for argument classification 940const ( 941 _ ArgumentType = iota 942 EXPLORATION 943 CONDITIONAL 944 EXTRACTION 945 CUSTOMIZATION 946) 947 948// RangeType is the integer type for element range choices 949type RangeType int 950 951// RangeType keys for element range choices 952const ( 953 NORANGE RangeType = iota 954 STRINGRANGE 955 VARIABLERANGE 956 INTEGERRANGE 957) 958 959// SeqEndType is used for -ucsc-based decisions 960type SeqEndType int 961 962// SeqEndType keys for -ucsc-based decisions 963const ( 964 _ SeqEndType = iota 965 ISSTART 966 ISSTOP 967 ISPOS 968) 969 970// SequenceType is used to record XML tag and position for -ucsc-based 971type SequenceType struct { 972 Based int 973 Which SeqEndType 974} 975 976// MUTEXES 977 978var hlock sync.Mutex 979 980var slock sync.RWMutex 981 982// ARGUMENT MAPS 983 984var argTypeIs = map[string]ArgumentType{ 985 "-unit": EXPLORATION, 986 "-Unit": EXPLORATION, 987 "-subset": EXPLORATION, 988 "-Subset": EXPLORATION, 989 "-section": EXPLORATION, 990 "-Section": EXPLORATION, 991 "-block": EXPLORATION, 992 "-Block": EXPLORATION, 993 "-branch": EXPLORATION, 994 "-Branch": EXPLORATION, 995 "-group": EXPLORATION, 996 "-Group": EXPLORATION, 997 "-division": EXPLORATION, 998 "-Division": EXPLORATION, 999 "-path": EXPLORATION, 1000 "-Path": 
EXPLORATION, 1001 "-pattern": EXPLORATION, 1002 "-Pattern": EXPLORATION, 1003 "-position": CONDITIONAL, 1004 "-select": CONDITIONAL, 1005 "-if": CONDITIONAL, 1006 "-unless": CONDITIONAL, 1007 "-match": CONDITIONAL, 1008 "-avoid": CONDITIONAL, 1009 "-and": CONDITIONAL, 1010 "-or": CONDITIONAL, 1011 "-equals": CONDITIONAL, 1012 "-contains": CONDITIONAL, 1013 "-is-within": CONDITIONAL, 1014 "-starts-with": CONDITIONAL, 1015 "-ends-with": CONDITIONAL, 1016 "-is-not": CONDITIONAL, 1017 "-is-before": CONDITIONAL, 1018 "-is-after": CONDITIONAL, 1019 "-matches": CONDITIONAL, 1020 "-resembles": CONDITIONAL, 1021 "-is-equal-to": CONDITIONAL, 1022 "-differs-from": CONDITIONAL, 1023 "-gt": CONDITIONAL, 1024 "-ge": CONDITIONAL, 1025 "-lt": CONDITIONAL, 1026 "-le": CONDITIONAL, 1027 "-eq": CONDITIONAL, 1028 "-ne": CONDITIONAL, 1029 "-element": EXTRACTION, 1030 "-first": EXTRACTION, 1031 "-last": EXTRACTION, 1032 "-encode": EXTRACTION, 1033 "-decode": EXTRACTION, 1034 "-decode64": EXTRACTION, 1035 "-plain": EXTRACTION, 1036 "-upper": EXTRACTION, 1037 "-lower": EXTRACTION, 1038 "-chain": EXTRACTION, 1039 "-title": EXTRACTION, 1040 "-order": EXTRACTION, 1041 "-year": EXTRACTION, 1042 "-doi": EXTRACTION, 1043 "-translate": EXTRACTION, 1044 "-replace": EXTRACTION, 1045 "-terms": EXTRACTION, 1046 "-words": EXTRACTION, 1047 "-pairs": EXTRACTION, 1048 "-reverse": EXTRACTION, 1049 "-letters": EXTRACTION, 1050 "-clauses": EXTRACTION, 1051 "-indices": EXTRACTION, 1052 "-meshcode": EXTRACTION, 1053 "-matrix": EXTRACTION, 1054 "-histogram": EXTRACTION, 1055 "-accented": EXTRACTION, 1056 "-num": EXTRACTION, 1057 "-len": EXTRACTION, 1058 "-sum": EXTRACTION, 1059 "-min": EXTRACTION, 1060 "-max": EXTRACTION, 1061 "-inc": EXTRACTION, 1062 "-dec": EXTRACTION, 1063 "-sub": EXTRACTION, 1064 "-avg": EXTRACTION, 1065 "-dev": EXTRACTION, 1066 "-med": EXTRACTION, 1067 "-mul": EXTRACTION, 1068 "-div": EXTRACTION, 1069 "-mod": EXTRACTION, 1070 "-bin": EXTRACTION, 1071 "-bit": EXTRACTION, 1072 "-0-based": 
EXTRACTION, 1073 "-zero-based": EXTRACTION, 1074 "-1-based": EXTRACTION, 1075 "-one-based": EXTRACTION, 1076 "-ucsc": EXTRACTION, 1077 "-ucsc-based": EXTRACTION, 1078 "-ucsc-coords": EXTRACTION, 1079 "-bed-based": EXTRACTION, 1080 "-bed-coords": EXTRACTION, 1081 "-revcomp": EXTRACTION, 1082 "-nucleic": EXTRACTION, 1083 "-fasta": EXTRACTION, 1084 "-ncbi2na": EXTRACTION, 1085 "-ncbi4na": EXTRACTION, 1086 "-molwt": EXTRACTION, 1087 "-hgvs": EXTRACTION, 1088 "-else": EXTRACTION, 1089 "-pfx": CUSTOMIZATION, 1090 "-sfx": CUSTOMIZATION, 1091 "-sep": CUSTOMIZATION, 1092 "-tab": CUSTOMIZATION, 1093 "-ret": CUSTOMIZATION, 1094 "-lbl": CUSTOMIZATION, 1095 "-clr": CUSTOMIZATION, 1096 "-pfc": CUSTOMIZATION, 1097 "-deq": CUSTOMIZATION, 1098 "-plg": CUSTOMIZATION, 1099 "-elg": CUSTOMIZATION, 1100 "-fwd": CUSTOMIZATION, 1101 "-awd": CUSTOMIZATION, 1102 "-wrp": CUSTOMIZATION, 1103 "-enc": CUSTOMIZATION, 1104 "-pkg": CUSTOMIZATION, 1105 "-rst": CUSTOMIZATION, 1106 "-def": CUSTOMIZATION, 1107 "-reg": CUSTOMIZATION, 1108 "-exp": CUSTOMIZATION, 1109 "-color": CUSTOMIZATION, 1110} 1111 1112var opTypeIs = map[string]OpType{ 1113 "-element": ELEMENT, 1114 "-first": FIRST, 1115 "-last": LAST, 1116 "-encode": ENCODE, 1117 "-decode": DECODE, 1118 "-decode64": DECODE, 1119 "-plain": PLAIN, 1120 "-upper": UPPER, 1121 "-lower": LOWER, 1122 "-chain": CHAIN, 1123 "-title": TITLE, 1124 "-order": ORDER, 1125 "-year": YEAR, 1126 "-doi": DOI, 1127 "-translate": TRANSLATE, 1128 "-replace": REPLACE, 1129 "-terms": TERMS, 1130 "-words": WORDS, 1131 "-pairs": PAIRS, 1132 "-reverse": REVERSE, 1133 "-letters": LETTERS, 1134 "-clauses": CLAUSES, 1135 "-indices": INDICES, 1136 "-meshcode": MESHCODE, 1137 "-matrix": MATRIX, 1138 "-histogram": HISTOGRAM, 1139 "-accented": ACCENTED, 1140 "-pfx": PFX, 1141 "-sfx": SFX, 1142 "-sep": SEP, 1143 "-tab": TAB, 1144 "-ret": RET, 1145 "-lbl": LBL, 1146 "-clr": CLR, 1147 "-pfc": PFC, 1148 "-deq": DEQ, 1149 "-plg": PLG, 1150 "-elg": ELG, 1151 "-fwd": FWD, 1152 "-awd": 
AWD, 1153 "-wrp": WRP, 1154 "-enc": ENC, 1155 "-pkg": PKG, 1156 "-rst": RST, 1157 "-def": DEF, 1158 "-reg": REG, 1159 "-exp": EXP, 1160 "-color": COLOR, 1161 "-position": POSITION, 1162 "-select": SELECT, 1163 "-if": IF, 1164 "-unless": UNLESS, 1165 "-match": MATCH, 1166 "-avoid": AVOID, 1167 "-and": AND, 1168 "-or": OR, 1169 "-equals": EQUALS, 1170 "-contains": CONTAINS, 1171 "-is-within": ISWITHIN, 1172 "-starts-with": STARTSWITH, 1173 "-ends-with": ENDSWITH, 1174 "-is-not": ISNOT, 1175 "-is-before": ISBEFORE, 1176 "-is-after": ISAFTER, 1177 "-matches": MATCHES, 1178 "-resembles": RESEMBLES, 1179 "-is-equal-to": ISEQUALTO, 1180 "-differs-from": DIFFERSFROM, 1181 "-gt": GT, 1182 "-ge": GE, 1183 "-lt": LT, 1184 "-le": LE, 1185 "-eq": EQ, 1186 "-ne": NE, 1187 "-num": NUM, 1188 "-len": LEN, 1189 "-sum": SUM, 1190 "-min": MIN, 1191 "-max": MAX, 1192 "-inc": INC, 1193 "-dec": DEC, 1194 "-sub": SUB, 1195 "-avg": AVG, 1196 "-dev": DEV, 1197 "-med": MED, 1198 "-mul": MUL, 1199 "-div": DIV, 1200 "-mod": MOD, 1201 "-bin": BIN, 1202 "-bit": BIT, 1203 "-0-based": ZEROBASED, 1204 "-zero-based": ZEROBASED, 1205 "-1-based": ONEBASED, 1206 "-one-based": ONEBASED, 1207 "-ucsc": UCSCBASED, 1208 "-ucsc-based": UCSCBASED, 1209 "-ucsc-coords": UCSCBASED, 1210 "-bed-based": UCSCBASED, 1211 "-bed-coords": UCSCBASED, 1212 "-revcomp": REVCOMP, 1213 "-nucleic": NUCLEIC, 1214 "-fasta": FASTA, 1215 "-ncbi2na": NCBI2NA, 1216 "-ncbi4na": NCBI4NA, 1217 "-molwt": MOLWT, 1218 "-hgvs": HGVS, 1219 "-else": ELSE, 1220} 1221 1222var sequenceTypeIs = map[string]SequenceType{ 1223 "INSDSeq:INSDInterval_from": {1, ISSTART}, 1224 "INSDSeq:INSDInterval_to": {1, ISSTOP}, 1225 "DocumentSummary:ChrStart": {0, ISSTART}, 1226 "DocumentSummary:ChrStop": {0, ISSTOP}, 1227 "DocumentSummary:Chr_start": {1, ISSTART}, 1228 "DocumentSummary:Chr_end": {1, ISSTOP}, 1229 "DocumentSummary:Chr_inner_start": {1, ISSTART}, 1230 "DocumentSummary:Chr_inner_end": {1, ISSTOP}, 1231 "DocumentSummary:Chr_outer_start": {1, 
ISSTART}, 1232 "DocumentSummary:Chr_outer_end": {1, ISSTOP}, 1233 "DocumentSummary:start": {1, ISSTART}, 1234 "DocumentSummary:stop": {1, ISSTOP}, 1235 "DocumentSummary:display_start": {1, ISSTART}, 1236 "DocumentSummary:display_stop": {1, ISSTOP}, 1237 "Entrezgene:Seq-interval_from": {0, ISSTART}, 1238 "Entrezgene:Seq-interval_to": {0, ISSTOP}, 1239 "GenomicInfoType:ChrStart": {0, ISSTART}, 1240 "GenomicInfoType:ChrStop": {0, ISSTOP}, 1241 "RS:position": {0, ISPOS}, 1242 "RS:@asnFrom": {0, ISSTART}, 1243 "RS:@asnTo": {0, ISSTOP}, 1244 "RS:@end": {0, ISSTOP}, 1245 "RS:@leftContigNeighborPos": {0, ISSTART}, 1246 "RS:@physMapInt": {0, ISPOS}, 1247 "RS:@protLoc": {0, ISPOS}, 1248 "RS:@rightContigNeighborPos": {0, ISSTOP}, 1249 "RS:@start": {0, ISSTART}, 1250 "RS:@structLoc": {0, ISPOS}, 1251} 1252 1253/* 1254 var conv = []string{"A", "C", "G", "T"} 1255 for i := 0; i < 4; i++ { 1256 for j := 0; j < 4; j++ { 1257 for k := 0; k < 4; k++ { 1258 for l := 0; l < 4; l++ { 1259 base := conv[i] + conv[j] + conv[k] + conv[l] 1260 idx := i*64 + j*16 + k*4 + l 1261 fmt.Fprintf(os.Stdout, "\t%d: \"%s\",\n", idx, base) 1262 } 1263 } 1264 } 1265 } 1266*/ 1267 1268var ncbi2naToIupac = map[int]string{ 1269 0: "AAAA", 1270 1: "AAAC", 1271 2: "AAAG", 1272 3: "AAAT", 1273 4: "AACA", 1274 5: "AACC", 1275 6: "AACG", 1276 7: "AACT", 1277 8: "AAGA", 1278 9: "AAGC", 1279 10: "AAGG", 1280 11: "AAGT", 1281 12: "AATA", 1282 13: "AATC", 1283 14: "AATG", 1284 15: "AATT", 1285 16: "ACAA", 1286 17: "ACAC", 1287 18: "ACAG", 1288 19: "ACAT", 1289 20: "ACCA", 1290 21: "ACCC", 1291 22: "ACCG", 1292 23: "ACCT", 1293 24: "ACGA", 1294 25: "ACGC", 1295 26: "ACGG", 1296 27: "ACGT", 1297 28: "ACTA", 1298 29: "ACTC", 1299 30: "ACTG", 1300 31: "ACTT", 1301 32: "AGAA", 1302 33: "AGAC", 1303 34: "AGAG", 1304 35: "AGAT", 1305 36: "AGCA", 1306 37: "AGCC", 1307 38: "AGCG", 1308 39: "AGCT", 1309 40: "AGGA", 1310 41: "AGGC", 1311 42: "AGGG", 1312 43: "AGGT", 1313 44: "AGTA", 1314 45: "AGTC", 1315 46: "AGTG", 1316 47: 
"AGTT", 1317 48: "ATAA", 1318 49: "ATAC", 1319 50: "ATAG", 1320 51: "ATAT", 1321 52: "ATCA", 1322 53: "ATCC", 1323 54: "ATCG", 1324 55: "ATCT", 1325 56: "ATGA", 1326 57: "ATGC", 1327 58: "ATGG", 1328 59: "ATGT", 1329 60: "ATTA", 1330 61: "ATTC", 1331 62: "ATTG", 1332 63: "ATTT", 1333 64: "CAAA", 1334 65: "CAAC", 1335 66: "CAAG", 1336 67: "CAAT", 1337 68: "CACA", 1338 69: "CACC", 1339 70: "CACG", 1340 71: "CACT", 1341 72: "CAGA", 1342 73: "CAGC", 1343 74: "CAGG", 1344 75: "CAGT", 1345 76: "CATA", 1346 77: "CATC", 1347 78: "CATG", 1348 79: "CATT", 1349 80: "CCAA", 1350 81: "CCAC", 1351 82: "CCAG", 1352 83: "CCAT", 1353 84: "CCCA", 1354 85: "CCCC", 1355 86: "CCCG", 1356 87: "CCCT", 1357 88: "CCGA", 1358 89: "CCGC", 1359 90: "CCGG", 1360 91: "CCGT", 1361 92: "CCTA", 1362 93: "CCTC", 1363 94: "CCTG", 1364 95: "CCTT", 1365 96: "CGAA", 1366 97: "CGAC", 1367 98: "CGAG", 1368 99: "CGAT", 1369 100: "CGCA", 1370 101: "CGCC", 1371 102: "CGCG", 1372 103: "CGCT", 1373 104: "CGGA", 1374 105: "CGGC", 1375 106: "CGGG", 1376 107: "CGGT", 1377 108: "CGTA", 1378 109: "CGTC", 1379 110: "CGTG", 1380 111: "CGTT", 1381 112: "CTAA", 1382 113: "CTAC", 1383 114: "CTAG", 1384 115: "CTAT", 1385 116: "CTCA", 1386 117: "CTCC", 1387 118: "CTCG", 1388 119: "CTCT", 1389 120: "CTGA", 1390 121: "CTGC", 1391 122: "CTGG", 1392 123: "CTGT", 1393 124: "CTTA", 1394 125: "CTTC", 1395 126: "CTTG", 1396 127: "CTTT", 1397 128: "GAAA", 1398 129: "GAAC", 1399 130: "GAAG", 1400 131: "GAAT", 1401 132: "GACA", 1402 133: "GACC", 1403 134: "GACG", 1404 135: "GACT", 1405 136: "GAGA", 1406 137: "GAGC", 1407 138: "GAGG", 1408 139: "GAGT", 1409 140: "GATA", 1410 141: "GATC", 1411 142: "GATG", 1412 143: "GATT", 1413 144: "GCAA", 1414 145: "GCAC", 1415 146: "GCAG", 1416 147: "GCAT", 1417 148: "GCCA", 1418 149: "GCCC", 1419 150: "GCCG", 1420 151: "GCCT", 1421 152: "GCGA", 1422 153: "GCGC", 1423 154: "GCGG", 1424 155: "GCGT", 1425 156: "GCTA", 1426 157: "GCTC", 1427 158: "GCTG", 1428 159: "GCTT", 1429 160: "GGAA", 1430 161: 
"GGAC", 1431 162: "GGAG", 1432 163: "GGAT", 1433 164: "GGCA", 1434 165: "GGCC", 1435 166: "GGCG", 1436 167: "GGCT", 1437 168: "GGGA", 1438 169: "GGGC", 1439 170: "GGGG", 1440 171: "GGGT", 1441 172: "GGTA", 1442 173: "GGTC", 1443 174: "GGTG", 1444 175: "GGTT", 1445 176: "GTAA", 1446 177: "GTAC", 1447 178: "GTAG", 1448 179: "GTAT", 1449 180: "GTCA", 1450 181: "GTCC", 1451 182: "GTCG", 1452 183: "GTCT", 1453 184: "GTGA", 1454 185: "GTGC", 1455 186: "GTGG", 1456 187: "GTGT", 1457 188: "GTTA", 1458 189: "GTTC", 1459 190: "GTTG", 1460 191: "GTTT", 1461 192: "TAAA", 1462 193: "TAAC", 1463 194: "TAAG", 1464 195: "TAAT", 1465 196: "TACA", 1466 197: "TACC", 1467 198: "TACG", 1468 199: "TACT", 1469 200: "TAGA", 1470 201: "TAGC", 1471 202: "TAGG", 1472 203: "TAGT", 1473 204: "TATA", 1474 205: "TATC", 1475 206: "TATG", 1476 207: "TATT", 1477 208: "TCAA", 1478 209: "TCAC", 1479 210: "TCAG", 1480 211: "TCAT", 1481 212: "TCCA", 1482 213: "TCCC", 1483 214: "TCCG", 1484 215: "TCCT", 1485 216: "TCGA", 1486 217: "TCGC", 1487 218: "TCGG", 1488 219: "TCGT", 1489 220: "TCTA", 1490 221: "TCTC", 1491 222: "TCTG", 1492 223: "TCTT", 1493 224: "TGAA", 1494 225: "TGAC", 1495 226: "TGAG", 1496 227: "TGAT", 1497 228: "TGCA", 1498 229: "TGCC", 1499 230: "TGCG", 1500 231: "TGCT", 1501 232: "TGGA", 1502 233: "TGGC", 1503 234: "TGGG", 1504 235: "TGGT", 1505 236: "TGTA", 1506 237: "TGTC", 1507 238: "TGTG", 1508 239: "TGTT", 1509 240: "TTAA", 1510 241: "TTAC", 1511 242: "TTAG", 1512 243: "TTAT", 1513 244: "TTCA", 1514 245: "TTCC", 1515 246: "TTCG", 1516 247: "TTCT", 1517 248: "TTGA", 1518 249: "TTGC", 1519 250: "TTGG", 1520 251: "TTGT", 1521 252: "TTTA", 1522 253: "TTTC", 1523 254: "TTTG", 1524 255: "TTTT", 1525} 1526 1527/* 1528 var conv = []string{"N", "A", "C", "M", "G", "R", "S", "V", "T", "W", "Y", "H", "K", "D", "B", "N"} 1529 for i := 0; i < 16; i++ { 1530 for j := 0; j < 16; j++ { 1531 base := conv[i] + conv[j] 1532 idx := i*16 + j 1533 fmt.Fprintf(os.Stdout, "\t%d: \"%s\",\n", idx, base) 1534 
} 1535 } 1536*/ 1537 1538var ncbi4naToIupac = map[int]string{ 1539 0: "NN", 1540 1: "NA", 1541 2: "NC", 1542 3: "NM", 1543 4: "NG", 1544 5: "NR", 1545 6: "NS", 1546 7: "NV", 1547 8: "NT", 1548 9: "NW", 1549 10: "NY", 1550 11: "NH", 1551 12: "NK", 1552 13: "ND", 1553 14: "NB", 1554 15: "NN", 1555 16: "AN", 1556 17: "AA", 1557 18: "AC", 1558 19: "AM", 1559 20: "AG", 1560 21: "AR", 1561 22: "AS", 1562 23: "AV", 1563 24: "AT", 1564 25: "AW", 1565 26: "AY", 1566 27: "AH", 1567 28: "AK", 1568 29: "AD", 1569 30: "AB", 1570 31: "AN", 1571 32: "CN", 1572 33: "CA", 1573 34: "CC", 1574 35: "CM", 1575 36: "CG", 1576 37: "CR", 1577 38: "CS", 1578 39: "CV", 1579 40: "CT", 1580 41: "CW", 1581 42: "CY", 1582 43: "CH", 1583 44: "CK", 1584 45: "CD", 1585 46: "CB", 1586 47: "CN", 1587 48: "MN", 1588 49: "MA", 1589 50: "MC", 1590 51: "MM", 1591 52: "MG", 1592 53: "MR", 1593 54: "MS", 1594 55: "MV", 1595 56: "MT", 1596 57: "MW", 1597 58: "MY", 1598 59: "MH", 1599 60: "MK", 1600 61: "MD", 1601 62: "MB", 1602 63: "MN", 1603 64: "GN", 1604 65: "GA", 1605 66: "GC", 1606 67: "GM", 1607 68: "GG", 1608 69: "GR", 1609 70: "GS", 1610 71: "GV", 1611 72: "GT", 1612 73: "GW", 1613 74: "GY", 1614 75: "GH", 1615 76: "GK", 1616 77: "GD", 1617 78: "GB", 1618 79: "GN", 1619 80: "RN", 1620 81: "RA", 1621 82: "RC", 1622 83: "RM", 1623 84: "RG", 1624 85: "RR", 1625 86: "RS", 1626 87: "RV", 1627 88: "RT", 1628 89: "RW", 1629 90: "RY", 1630 91: "RH", 1631 92: "RK", 1632 93: "RD", 1633 94: "RB", 1634 95: "RN", 1635 96: "SN", 1636 97: "SA", 1637 98: "SC", 1638 99: "SM", 1639 100: "SG", 1640 101: "SR", 1641 102: "SS", 1642 103: "SV", 1643 104: "ST", 1644 105: "SW", 1645 106: "SY", 1646 107: "SH", 1647 108: "SK", 1648 109: "SD", 1649 110: "SB", 1650 111: "SN", 1651 112: "VN", 1652 113: "VA", 1653 114: "VC", 1654 115: "VM", 1655 116: "VG", 1656 117: "VR", 1657 118: "VS", 1658 119: "VV", 1659 120: "VT", 1660 121: "VW", 1661 122: "VY", 1662 123: "VH", 1663 124: "VK", 1664 125: "VD", 1665 126: "VB", 1666 127: "VN", 
1667 128: "TN", 1668 129: "TA", 1669 130: "TC", 1670 131: "TM", 1671 132: "TG", 1672 133: "TR", 1673 134: "TS", 1674 135: "TV", 1675 136: "TT", 1676 137: "TW", 1677 138: "TY", 1678 139: "TH", 1679 140: "TK", 1680 141: "TD", 1681 142: "TB", 1682 143: "TN", 1683 144: "WN", 1684 145: "WA", 1685 146: "WC", 1686 147: "WM", 1687 148: "WG", 1688 149: "WR", 1689 150: "WS", 1690 151: "WV", 1691 152: "WT", 1692 153: "WW", 1693 154: "WY", 1694 155: "WH", 1695 156: "WK", 1696 157: "WD", 1697 158: "WB", 1698 159: "WN", 1699 160: "YN", 1700 161: "YA", 1701 162: "YC", 1702 163: "YM", 1703 164: "YG", 1704 165: "YR", 1705 166: "YS", 1706 167: "YV", 1707 168: "YT", 1708 169: "YW", 1709 170: "YY", 1710 171: "YH", 1711 172: "YK", 1712 173: "YD", 1713 174: "YB", 1714 175: "YN", 1715 176: "HN", 1716 177: "HA", 1717 178: "HC", 1718 179: "HM", 1719 180: "HG", 1720 181: "HR", 1721 182: "HS", 1722 183: "HV", 1723 184: "HT", 1724 185: "HW", 1725 186: "HY", 1726 187: "HH", 1727 188: "HK", 1728 189: "HD", 1729 190: "HB", 1730 191: "HN", 1731 192: "KN", 1732 193: "KA", 1733 194: "KC", 1734 195: "KM", 1735 196: "KG", 1736 197: "KR", 1737 198: "KS", 1738 199: "KV", 1739 200: "KT", 1740 201: "KW", 1741 202: "KY", 1742 203: "KH", 1743 204: "KK", 1744 205: "KD", 1745 206: "KB", 1746 207: "KN", 1747 208: "DN", 1748 209: "DA", 1749 210: "DC", 1750 211: "DM", 1751 212: "DG", 1752 213: "DR", 1753 214: "DS", 1754 215: "DV", 1755 216: "DT", 1756 217: "DW", 1757 218: "DY", 1758 219: "DH", 1759 220: "DK", 1760 221: "DD", 1761 222: "DB", 1762 223: "DN", 1763 224: "BN", 1764 225: "BA", 1765 226: "BC", 1766 227: "BM", 1767 228: "BG", 1768 229: "BR", 1769 230: "BS", 1770 231: "BV", 1771 232: "BT", 1772 233: "BW", 1773 234: "BY", 1774 235: "BH", 1775 236: "BK", 1776 237: "BD", 1777 238: "BB", 1778 239: "BN", 1779 240: "NN", 1780 241: "NA", 1781 242: "NC", 1782 243: "NM", 1783 244: "NG", 1784 245: "NR", 1785 246: "NS", 1786 247: "NV", 1787 248: "NT", 1788 249: "NW", 1789 250: "NY", 1790 251: "NH", 1791 252: "NK", 
1792 253: "ND", 1793 254: "NB", 1794 255: "NN", 1795} 1796 1797// DATA OBJECTS 1798 1799// Step contains parameters for executing a single command step 1800type Step struct { 1801 Type OpType 1802 Value string 1803 Parent string 1804 Match string 1805 Attrib string 1806 TypL RangeType 1807 StrL string 1808 IntL int 1809 TypR RangeType 1810 StrR string 1811 IntR int 1812 Norm bool 1813 Wild bool 1814} 1815 1816// Operation breaks commands into sequential steps 1817type Operation struct { 1818 Type OpType 1819 Value string 1820 Stages []*Step 1821} 1822 1823// Block contains nested instructions for executing commands 1824type Block struct { 1825 Visit string 1826 Parent string 1827 Match string 1828 Path []string 1829 Working []string 1830 Parsed []string 1831 Position string 1832 Foreword string 1833 Afterword string 1834 Conditions []*Operation 1835 Commands []*Operation 1836 Failure []*Operation 1837 Subtasks []*Block 1838} 1839 1840// Limiter is used for collecting specific nodes (e.g., first and last) 1841type Limiter struct { 1842 Obj *eutils.XMLNode 1843 Idx int 1844 Lvl int 1845} 1846 1847// UTILITIES 1848 1849func hasSpaceOrHyphen(str string) bool { 1850 1851 for _, ch := range str { 1852 if ch == ' ' || ch == '-' { 1853 return true 1854 } 1855 } 1856 1857 return false 1858} 1859 1860func isAllCapsOrDigits(str string) bool { 1861 1862 for _, ch := range str { 1863 if !unicode.IsUpper(ch) && !unicode.IsDigit(ch) { 1864 return false 1865 } 1866 } 1867 1868 return true 1869} 1870 1871// sortStringByWords sorts the individual words in a string 1872func sortStringByWords(str string) string { 1873 1874 str = eutils.RemoveCommaOrSemicolon(str) 1875 1876 // check for multiple words 1877 if hasSpaceOrHyphen(str) { 1878 flds := strings.Fields(str) 1879 sort.Slice(flds, func(i, j int) bool { return flds[i] < flds[j] }) 1880 str = strings.Join(flds, " ") 1881 str = strings.Replace(str, "-", " ", -1) 1882 str = eutils.CompressRunsOfSpaces(str) 1883 str = 
strings.TrimRight(str, ".?:") 1884 } 1885 1886 return str 1887} 1888 1889func parseFlag(str string) OpType { 1890 1891 op, ok := opTypeIs[str] 1892 if ok { 1893 return op 1894 } 1895 1896 if len(str) > 1 && str[0] == '-' && isAllCapsOrDigits(str[1:]) { 1897 return VARIABLE 1898 } 1899 1900 if len(str) > 2 && strings.HasPrefix(str, "--") && isAllCapsOrDigits(str[2:]) { 1901 return ACCUMULATOR 1902 } 1903 1904 if len(str) > 0 && str[0] == '-' { 1905 return UNRECOGNIZED 1906 } 1907 1908 return UNSET 1909} 1910 1911func parseMarkup(str, cmd string) int { 1912 1913 switch str { 1914 case "fuse", "fused": 1915 return eutils.FUSE 1916 case "space", "spaces": 1917 return eutils.SPACE 1918 case "period", "periods": 1919 return eutils.PERIOD 1920 case "bracket", "brackets": 1921 return eutils.BRACKETS 1922 case "markdown": 1923 return eutils.MARKDOWN 1924 case "slash": 1925 return eutils.SLASH 1926 case "tag", "tags": 1927 return eutils.TAGS 1928 case "terse": 1929 return eutils.TERSE 1930 default: 1931 if str != "" { 1932 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized %s value '%s'\n", cmd, str) 1933 os.Exit(1) 1934 } 1935 } 1936 return eutils.NOMARKUP 1937} 1938 1939// DebugBlock examines structure of parsed arguments (undocumented) 1940/* 1941func DebugBlock(blk *Block, depth int) { 1942 1943 doIndent := func(indt int) { 1944 for i := 1; i < indt; i++ { 1945 fmt.Fprintf(os.Stderr, " ") 1946 } 1947 } 1948 1949 doIndent(depth) 1950 1951 if blk.Visit != "" { 1952 doIndent(depth + 1) 1953 fmt.Fprintf(os.Stderr, "<Visit> %s </Visit>\n", blk.Visit) 1954 } 1955 if len(blk.Parsed) > 0 { 1956 doIndent(depth + 1) 1957 fmt.Fprintf(os.Stderr, "<Parsed>") 1958 for _, str := range blk.Parsed { 1959 fmt.Fprintf(os.Stderr, " %s", str) 1960 } 1961 fmt.Fprintf(os.Stderr, " </Parsed>\n") 1962 } 1963 1964 if len(blk.Subtasks) > 0 { 1965 for _, sub := range blk.Subtasks { 1966 DebugBlock(sub, depth+1) 1967 } 1968 } 1969} 1970*/ 1971 1972// PARSE COMMAND-LINE ARGUMENTS 1973 1974// 
parseArguments parses nested exploration instruction from command-line arguments 1975func parseArguments(cmdargs []string, pttrn string) *Block { 1976 1977 // different names of exploration control arguments allow multiple levels of nested "for" loops in a linear command line 1978 // (capitalized versions for backward-compatibility with original Perl implementation handling of recursive definitions) 1979 var ( 1980 lcname = []string{ 1981 "", 1982 "-unit", 1983 "-subset", 1984 "-section", 1985 "-block", 1986 "-branch", 1987 "-group", 1988 "-division", 1989 "-path", 1990 "-pattern", 1991 } 1992 1993 ucname = []string{ 1994 "", 1995 "-Unit", 1996 "-Subset", 1997 "-Section", 1998 "-Block", 1999 "-Branch", 2000 "-Group", 2001 "-Division", 2002 "-Path", 2003 "-Pattern", 2004 } 2005 ) 2006 2007 // parseCommands recursive definition 2008 var parseCommands func(parent *Block, startLevel LevelType) 2009 2010 // parseCommands does initial parsing of exploration command structure 2011 parseCommands = func(parent *Block, startLevel LevelType) { 2012 2013 // find next highest level exploration argument 2014 findNextLevel := func(args []string, level LevelType) (LevelType, string, string) { 2015 2016 if len(args) > 1 { 2017 2018 for { 2019 2020 if level < UNIT { 2021 break 2022 } 2023 2024 lctag := lcname[level] 2025 uctag := ucname[level] 2026 2027 for _, txt := range args { 2028 if txt == lctag || txt == uctag { 2029 return level, lctag, uctag 2030 } 2031 } 2032 2033 level-- 2034 } 2035 } 2036 2037 return 0, "", "" 2038 } 2039 2040 arguments := parent.Working 2041 2042 level, lctag, uctag := findNextLevel(arguments, startLevel) 2043 2044 if level < UNIT { 2045 2046 // break recursion 2047 return 2048 } 2049 2050 // group arguments at a given exploration level 2051 subsetCommands := func(args []string) *Block { 2052 2053 max := len(args) 2054 2055 visit := "" 2056 2057 // extract name of object to visit 2058 if max > 1 { 2059 visit = args[1] 2060 args = args[2:] 2061 max -= 2 
2062 } 2063 2064 partition := 0 2065 for cur, str := range args { 2066 2067 // record point of next exploration command 2068 partition = cur + 1 2069 2070 // skip if not a command 2071 if len(str) < 1 || str[0] != '-' { 2072 continue 2073 } 2074 2075 if argTypeIs[str] == EXPLORATION { 2076 partition = cur 2077 break 2078 } 2079 } 2080 2081 // convert slashes (e.g., parent/child construct) to periods (e.g., dotted exploration path) 2082 if strings.Contains(visit, "/") { 2083 if !strings.Contains(visit, ".") { 2084 visit = strings.Replace(visit, "/", ".", -1) 2085 } 2086 } 2087 2088 // parse parent.child or dotted path construct 2089 // colon indicates a namespace prefix in any or all of the components 2090 prnt, rmdr := eutils.SplitInTwoRight(visit, ".") 2091 match, rest := eutils.SplitInTwoLeft(rmdr, ".") 2092 2093 if rest != "" { 2094 2095 // exploration match on first component, then search remainder one level at a time with subsequent components 2096 dirs := strings.Split(rmdr, ".") 2097 2098 // signal with "path" position 2099 return &Block{Visit: visit, Parent: "", Match: prnt, Path: dirs, Position: "path", Parsed: args[0:partition], Working: args[partition:]} 2100 } 2101 2102 // promote arguments parsed at this level 2103 return &Block{Visit: visit, Parent: prnt, Match: match, Parsed: args[0:partition], Working: args[partition:]} 2104 } 2105 2106 cur := 0 2107 2108 // search for positions of current exploration command 2109 2110 for idx, txt := range arguments { 2111 if txt == lctag || txt == uctag { 2112 if idx == 0 { 2113 continue 2114 } 2115 2116 blk := subsetCommands(arguments[cur:idx]) 2117 parseCommands(blk, level-1) 2118 parent.Subtasks = append(parent.Subtasks, blk) 2119 2120 cur = idx 2121 } 2122 } 2123 2124 if cur < len(arguments) { 2125 blk := subsetCommands(arguments[cur:]) 2126 parseCommands(blk, level-1) 2127 parent.Subtasks = append(parent.Subtasks, blk) 2128 } 2129 2130 // clear execution arguments from parent after subsetting 2131 
parent.Working = nil 2132 } 2133 2134 // parse optional [min:max], [&VAR:&VAR], or [after|before] range specification 2135 parseRange := func(item, rnge string) (typL RangeType, strL string, intL int, typR RangeType, strR string, intR int) { 2136 2137 typL = NORANGE 2138 typR = NORANGE 2139 strL = "" 2140 strR = "" 2141 intL = 0 2142 intR = 0 2143 2144 if rnge == "" { 2145 // no range specification, return default values 2146 return 2147 } 2148 2149 // check if last character is right square bracket 2150 if !strings.HasSuffix(rnge, "]") { 2151 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized range %s\n", rnge) 2152 os.Exit(1) 2153 } 2154 2155 rnge = strings.TrimSuffix(rnge, "]") 2156 2157 if rnge == "" { 2158 fmt.Fprintf(os.Stderr, "\nERROR: Empty range %s[]\n", item) 2159 os.Exit(1) 2160 } 2161 2162 // check for [after|before] variant 2163 if strings.Contains(rnge, "|") { 2164 2165 strL, strR = eutils.SplitInTwoLeft(rnge, "|") 2166 // spacing matters, so do not call TrimSpace 2167 2168 if strL == "" && strR == "" { 2169 fmt.Fprintf(os.Stderr, "\nERROR: Empty range %s[|]\n", item) 2170 os.Exit(1) 2171 } 2172 2173 typL = STRINGRANGE 2174 typR = STRINGRANGE 2175 2176 // return statement returns named variables 2177 return 2178 } 2179 2180 // otherwise must have colon within brackets 2181 if !strings.Contains(rnge, ":") { 2182 fmt.Fprintf(os.Stderr, "\nERROR: Colon missing in range %s[%s]\n", item, rnge) 2183 os.Exit(1) 2184 } 2185 2186 // split at colon 2187 lft, rgt := eutils.SplitInTwoLeft(rnge, ":") 2188 2189 lft = strings.TrimSpace(lft) 2190 rgt = strings.TrimSpace(rgt) 2191 2192 if lft == "" && rgt == "" { 2193 fmt.Fprintf(os.Stderr, "\nERROR: Empty range %s[:]\n", item) 2194 os.Exit(1) 2195 } 2196 2197 // for variable, parse optional +/- offset suffix 2198 parseOffset := func(str string) (string, int) { 2199 2200 if str == "" || str[0] == ' ' { 2201 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized variable '&%s'\n", str) 2202 os.Exit(1) 2203 } 2204 2205 pls := "" 
2206 mns := "" 2207 2208 ofs := 0 2209 2210 // check for &VAR+1 or &VAR-1 integer adjustment 2211 str, pls = eutils.SplitInTwoLeft(str, "+") 2212 str, mns = eutils.SplitInTwoLeft(str, "-") 2213 2214 if pls != "" { 2215 val, err := strconv.Atoi(pls) 2216 if err != nil { 2217 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized range adjustment &%s+%s\n", str, pls) 2218 os.Exit(1) 2219 } 2220 ofs = val 2221 } else if mns != "" { 2222 val, err := strconv.Atoi(mns) 2223 if err != nil { 2224 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized range adjustment &%s-%s\n", str, mns) 2225 os.Exit(1) 2226 } 2227 ofs = -val 2228 } 2229 2230 return str, ofs 2231 } 2232 2233 // parse integer position, 1-based coordinate must be greater than 0 2234 parseInteger := func(str string, mustBePositive bool) int { 2235 if str == "" { 2236 return 0 2237 } 2238 2239 val, err := strconv.Atoi(str) 2240 if err != nil { 2241 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized range component %s[%s:]\n", item, str) 2242 os.Exit(1) 2243 } 2244 if mustBePositive { 2245 if val < 1 { 2246 fmt.Fprintf(os.Stderr, "\nERROR: Range component %s[%s:] must be positive\n", item, str) 2247 os.Exit(1) 2248 } 2249 } else { 2250 if val == 0 { 2251 fmt.Fprintf(os.Stderr, "\nERROR: Range component %s[%s:] must not be zero\n", item, str) 2252 os.Exit(1) 2253 } 2254 } 2255 2256 return val 2257 } 2258 2259 if lft != "" { 2260 if lft[0] == '&' { 2261 lft = lft[1:] 2262 strL, intL = parseOffset(lft) 2263 typL = VARIABLERANGE 2264 } else { 2265 intL = parseInteger(lft, true) 2266 typL = INTEGERRANGE 2267 } 2268 } 2269 2270 if rgt != "" { 2271 if rgt[0] == '&' { 2272 rgt = rgt[1:] 2273 strR, intR = parseOffset(rgt) 2274 typR = VARIABLERANGE 2275 } else { 2276 intR = parseInteger(rgt, false) 2277 typR = INTEGERRANGE 2278 } 2279 } 2280 2281 // return statement required to return named variables 2282 return 2283 } 2284 2285 parseConditionals := func(cmds *Block, arguments []string) []*Operation { 2286 2287 max := len(arguments) 2288 if 
max < 1 { 2289 return nil 2290 } 2291 2292 // check for missing condition command 2293 txt := arguments[0] 2294 if txt != "-if" && txt != "-unless" && txt != "-select" && txt != "-match" && txt != "-avoid" && txt != "-position" { 2295 fmt.Fprintf(os.Stderr, "\nERROR: Missing -if command before '%s'\n", txt) 2296 os.Exit(1) 2297 } 2298 if txt == "-position" && max > 2 { 2299 fmt.Fprintf(os.Stderr, "\nERROR: Cannot combine -position with -if or -unless commands\n") 2300 os.Exit(1) 2301 } 2302 // check for missing argument after last condition 2303 txt = arguments[max-1] 2304 if len(txt) > 0 && txt[0] == '-' { 2305 fmt.Fprintf(os.Stderr, "\nERROR: Item missing after %s command\n", txt) 2306 os.Exit(1) 2307 } 2308 2309 cond := make([]*Operation, 0, max) 2310 2311 // parse conditional clause into execution step 2312 parseStep := func(op *Operation, elementColonValue bool) { 2313 2314 if op == nil { 2315 return 2316 } 2317 2318 str := op.Value 2319 2320 status := ELEMENT 2321 2322 // isolate and parse optional [min:max], [&VAR:&VAR], or [after|before] range specification 2323 str, rnge := eutils.SplitInTwoLeft(str, "[") 2324 2325 str = strings.TrimSpace(str) 2326 rnge = strings.TrimSpace(rnge) 2327 2328 if str == "" && rnge != "" { 2329 fmt.Fprintf(os.Stderr, "\nERROR: Variable missing in range specification [%s\n", rnge) 2330 os.Exit(1) 2331 } 2332 2333 typL, strL, intL, typR, strR, intR := parseRange(str, rnge) 2334 2335 // check for pound, percent, or caret character at beginning of name 2336 if len(str) > 1 { 2337 switch str[0] { 2338 case '&': 2339 if isAllCapsOrDigits(str[1:]) { 2340 status = VARIABLE 2341 str = str[1:] 2342 } else if strings.Contains(str, ":") { 2343 fmt.Fprintf(os.Stderr, "\nERROR: Unsupported construct '%s', use -if &VARIABLE -equals VALUE instead\n", str) 2344 os.Exit(1) 2345 } else { 2346 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized variable '%s'\n", str) 2347 os.Exit(1) 2348 } 2349 case '#': 2350 status = COUNT 2351 str = str[1:] 2352 case 
'%': 2353 status = LENGTH 2354 str = str[1:] 2355 case '^': 2356 status = DEPTH 2357 str = str[1:] 2358 default: 2359 } 2360 } else if str == "+" { 2361 status = INDEX 2362 } 2363 2364 // parse parent/element@attribute construct 2365 // colon indicates a namespace prefix in any or all of the components 2366 prnt, match := eutils.SplitInTwoRight(str, "/") 2367 match, attrib := eutils.SplitInTwoLeft(match, "@") 2368 val := "" 2369 2370 // leading colon indicates namespace prefix wildcard 2371 wildcard := false 2372 if strings.HasPrefix(prnt, ":") || strings.HasPrefix(match, ":") || strings.HasPrefix(attrib, ":") { 2373 wildcard = true 2374 } 2375 2376 if elementColonValue { 2377 2378 // allow parent/element@attribute:value construct for deprecated -match and -avoid, and for subsequent -and and -or commands 2379 match, val = eutils.SplitInTwoLeft(str, ":") 2380 prnt, match = eutils.SplitInTwoRight(match, "/") 2381 match, attrib = eutils.SplitInTwoLeft(match, "@") 2382 } 2383 2384 norm := true 2385 if rnge != "" { 2386 if typL != NORANGE || typR != NORANGE || strL != "" || strR != "" || intL != 0 || intR != 0 { 2387 norm = false 2388 } 2389 } 2390 2391 tsk := &Step{Type: status, Value: str, Parent: prnt, Match: match, Attrib: attrib, 2392 TypL: typL, StrL: strL, IntL: intL, TypR: typR, StrR: strR, IntR: intR, 2393 Norm: norm, Wild: wildcard} 2394 2395 op.Stages = append(op.Stages, tsk) 2396 2397 // transform old -match "element:value" to -match element -equals value 2398 if val != "" { 2399 tsk := &Step{Type: EQUALS, Value: val} 2400 op.Stages = append(op.Stages, tsk) 2401 } 2402 } 2403 2404 idx := 0 2405 2406 // conditionals should alternate between command and object/value 2407 expectDash := true 2408 last := "" 2409 2410 var op *Operation 2411 2412 // flag to allow element-colon-value for deprecated -match and -avoid commands, otherwise colon is for namespace prefixes 2413 elementColonValue := false 2414 2415 status := UNSET 2416 2417 // parse command strings into 
operation structure 2418 for idx < max { 2419 str := arguments[idx] 2420 idx++ 2421 2422 // conditionals should alternate between command and object/value 2423 if expectDash { 2424 if len(str) < 1 || str[0] != '-' { 2425 fmt.Fprintf(os.Stderr, "\nERROR: Unexpected '%s' argument after '%s'\n", str, last) 2426 os.Exit(1) 2427 } 2428 expectDash = false 2429 } else { 2430 if len(str) > 0 && str[0] == '-' { 2431 fmt.Fprintf(os.Stderr, "\nERROR: Unexpected '%s' command after '%s'\n", str, last) 2432 os.Exit(1) 2433 } 2434 expectDash = true 2435 } 2436 last = str 2437 2438 switch status { 2439 case UNSET: 2440 status = parseFlag(str) 2441 case POSITION: 2442 if cmds.Position != "" { 2443 fmt.Fprintf(os.Stderr, "\nERROR: -position '%s' conflicts with existing '%s'\n", str, cmds.Position) 2444 os.Exit(1) 2445 } 2446 cmds.Position = str 2447 status = UNSET 2448 case MATCH, AVOID: 2449 elementColonValue = true 2450 fallthrough 2451 case SELECT, IF, UNLESS, AND, OR: 2452 op = &Operation{Type: status, Value: str} 2453 cond = append(cond, op) 2454 parseStep(op, elementColonValue) 2455 status = UNSET 2456 case EQUALS, CONTAINS, ISWITHIN, STARTSWITH, ENDSWITH, ISNOT, ISBEFORE, ISAFTER: 2457 if op != nil { 2458 if len(str) > 1 && str[0] == '\\' { 2459 // first character may be backslash protecting dash (undocumented) 2460 str = str[1:] 2461 } 2462 tsk := &Step{Type: status, Value: str} 2463 op.Stages = append(op.Stages, tsk) 2464 op = nil 2465 } else { 2466 fmt.Fprintf(os.Stderr, "\nERROR: Unexpected adjacent string match constraints\n") 2467 os.Exit(1) 2468 } 2469 status = UNSET 2470 case MATCHES: 2471 if op != nil { 2472 if len(str) > 1 && str[0] == '\\' { 2473 // first character may be backslash protecting dash (undocumented) 2474 str = str[1:] 2475 } 2476 str = eutils.RemoveCommaOrSemicolon(str) 2477 tsk := &Step{Type: status, Value: str} 2478 op.Stages = append(op.Stages, tsk) 2479 op = nil 2480 } else { 2481 fmt.Fprintf(os.Stderr, "\nERROR: Unexpected adjacent string match 
constraints\n") 2482 os.Exit(1) 2483 } 2484 status = UNSET 2485 case RESEMBLES: 2486 if op != nil { 2487 if len(str) > 1 && str[0] == '\\' { 2488 // first character may be backslash protecting dash (undocumented) 2489 str = str[1:] 2490 } 2491 str = sortStringByWords(str) 2492 tsk := &Step{Type: status, Value: str} 2493 op.Stages = append(op.Stages, tsk) 2494 op = nil 2495 } else { 2496 fmt.Fprintf(os.Stderr, "\nERROR: Unexpected adjacent string match constraints\n") 2497 os.Exit(1) 2498 } 2499 status = UNSET 2500 case ISEQUALTO, DIFFERSFROM: 2501 if op != nil { 2502 if len(str) < 1 { 2503 fmt.Fprintf(os.Stderr, "\nERROR: Empty conditional argument\n") 2504 os.Exit(1) 2505 } 2506 ch := str[0] 2507 // uses element as second argument 2508 orig := str 2509 if ch == '#' || ch == '%' || ch == '^' { 2510 // check for pound, percent, or caret character at beginning of element (undocumented) 2511 str = str[1:] 2512 if len(str) < 1 { 2513 fmt.Fprintf(os.Stderr, "\nERROR: Unexpected conditional constraints\n") 2514 os.Exit(1) 2515 } 2516 ch = str[0] 2517 } 2518 if (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') { 2519 prnt, match := eutils.SplitInTwoRight(str, "/") 2520 match, attrib := eutils.SplitInTwoLeft(match, "@") 2521 wildcard := false 2522 if strings.HasPrefix(prnt, ":") || strings.HasPrefix(match, ":") || strings.HasPrefix(attrib, ":") { 2523 wildcard = true 2524 } 2525 tsk := &Step{Type: status, Value: orig, Parent: prnt, Match: match, Attrib: attrib, Wild: wildcard} 2526 op.Stages = append(op.Stages, tsk) 2527 } else { 2528 fmt.Fprintf(os.Stderr, "\nERROR: Unexpected conditional constraints\n") 2529 os.Exit(1) 2530 } 2531 op = nil 2532 } 2533 status = UNSET 2534 case GT, GE, LT, LE, EQ, NE: 2535 if op != nil { 2536 if len(str) > 1 && str[0] == '\\' { 2537 // first character may be backslash protecting minus sign (undocumented) 2538 str = str[1:] 2539 } 2540 if len(str) < 1 { 2541 fmt.Fprintf(os.Stderr, "\nERROR: Empty numeric match constraints\n") 2542 
os.Exit(1) 2543 } 2544 ch := str[0] 2545 if (ch >= '0' && ch <= '9') || ch == '-' || ch == '+' { 2546 // literal numeric constant 2547 tsk := &Step{Type: status, Value: str} 2548 op.Stages = append(op.Stages, tsk) 2549 } else { 2550 // numeric test allows element as second argument 2551 orig := str 2552 if ch == '#' || ch == '%' || ch == '^' { 2553 // check for pound, percent, or caret character at beginning of element (undocumented) 2554 str = str[1:] 2555 if len(str) < 1 { 2556 fmt.Fprintf(os.Stderr, "\nERROR: Unexpected numeric match constraints\n") 2557 os.Exit(1) 2558 } 2559 ch = str[0] 2560 } 2561 if (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') { 2562 prnt, match := eutils.SplitInTwoRight(str, "/") 2563 match, attrib := eutils.SplitInTwoLeft(match, "@") 2564 wildcard := false 2565 if strings.HasPrefix(prnt, ":") || strings.HasPrefix(match, ":") || strings.HasPrefix(attrib, ":") { 2566 wildcard = true 2567 } 2568 tsk := &Step{Type: status, Value: orig, Parent: prnt, Match: match, Attrib: attrib, Wild: wildcard} 2569 op.Stages = append(op.Stages, tsk) 2570 } else { 2571 fmt.Fprintf(os.Stderr, "\nERROR: Unexpected numeric match constraints\n") 2572 os.Exit(1) 2573 } 2574 } 2575 op = nil 2576 } else { 2577 fmt.Fprintf(os.Stderr, "\nERROR: Unexpected adjacent numeric match constraints\n") 2578 os.Exit(1) 2579 } 2580 status = UNSET 2581 case UNRECOGNIZED: 2582 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized argument '%s'\n", str) 2583 os.Exit(1) 2584 default: 2585 fmt.Fprintf(os.Stderr, "\nERROR: Unexpected argument '%s'\n", str) 2586 os.Exit(1) 2587 } 2588 } 2589 2590 return cond 2591 } 2592 2593 parseExtractions := func(cmds *Block, arguments []string) []*Operation { 2594 2595 max := len(arguments) 2596 if max < 1 { 2597 return nil 2598 } 2599 2600 // check for missing -element (or -first, etc.) 
command 2601 txt := arguments[0] 2602 if len(txt) < 1 || txt[0] != '-' { 2603 fmt.Fprintf(os.Stderr, "\nERROR: Missing -element command before '%s'\n", txt) 2604 os.Exit(1) 2605 } 2606 // check for missing argument after last -element (or -first, etc.) command 2607 txt = arguments[max-1] 2608 if len(txt) > 0 && txt[0] == '-' { 2609 if txt == "-rst" { 2610 fmt.Fprintf(os.Stderr, "\nERROR: Unexpected position for %s command\n", txt) 2611 os.Exit(1) 2612 } else if txt == "-clr" { 2613 // main loop runs out after trailing -clr, add another so this one will be executed 2614 arguments = append(arguments, "-clr") 2615 max++ 2616 } else if max < 2 || arguments[max-2] != "-lbl" { 2617 fmt.Fprintf(os.Stderr, "\nERROR: Item missing after %s command\n", txt) 2618 os.Exit(1) 2619 } 2620 } 2621 2622 comm := make([]*Operation, 0, max) 2623 2624 // parse next argument 2625 nextStatus := func(str string) OpType { 2626 2627 status := parseFlag(str) 2628 2629 switch status { 2630 case VARIABLE: 2631 op := &Operation{Type: status, Value: str[1:]} 2632 comm = append(comm, op) 2633 status = VALUE 2634 case ACCUMULATOR: 2635 op := &Operation{Type: status, Value: str[2:]} 2636 comm = append(comm, op) 2637 status = VALUE 2638 case CLR, RST: 2639 op := &Operation{Type: status, Value: ""} 2640 comm = append(comm, op) 2641 status = UNSET 2642 case ELEMENT, FIRST, LAST, ENCODE, DECODE, PLAIN, UPPER, LOWER, CHAIN, TITLE, ORDER, YEAR, DOI, TRANSLATE, REPLACE, 2643 TERMS, WORDS, PAIRS, REVERSE, LETTERS, CLAUSES, INDICES, MESHCODE, MATRIX, HISTOGRAM, ACCENTED, 2644 NUM, LEN, SUM, MIN, MAX, INC, DEC, SUB, AVG, DEV, MED, MUL, DIV, MOD, BIN, BIT, ZEROBASED, ONEBASED, UCSCBASED, 2645 REVCOMP, NUCLEIC, FASTA, NCBI2NA, NCBI4NA, MOLWT, HGVS: 2646 case TAB, RET, PFX, SFX, SEP, LBL, PFC, DEQ, PLG, ELG, WRP, ENC, DEF, REG, EXP, COLOR: 2647 case FWD, AWD, PKG: 2648 case UNSET: 2649 fmt.Fprintf(os.Stderr, "\nERROR: No -element before '%s'\n", str) 2650 os.Exit(1) 2651 case UNRECOGNIZED: 2652 
fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized argument '%s'\n", str) 2653 os.Exit(1) 2654 default: 2655 fmt.Fprintf(os.Stderr, "\nERROR: Misplaced %s command\n", str) 2656 os.Exit(1) 2657 } 2658 2659 return status 2660 } 2661 2662 // parse extraction clause into individual steps 2663 parseSteps := func(op *Operation, pttrn string) { 2664 2665 if op == nil { 2666 return 2667 } 2668 2669 stat := op.Type 2670 str := op.Value 2671 2672 // element names combined with commas are treated as a prefix-separator-suffix group 2673 comma := strings.Split(str, ",") 2674 2675 rnge := "" 2676 for _, item := range comma { 2677 status := stat 2678 2679 // isolate and parse optional [min:max], [&VAR:&VAR], or [after|before] range specification 2680 item, rnge = eutils.SplitInTwoLeft(item, "[") 2681 2682 item = strings.TrimSpace(item) 2683 rnge = strings.TrimSpace(rnge) 2684 2685 if item == "" && rnge != "" { 2686 fmt.Fprintf(os.Stderr, "\nERROR: Variable missing in range specification [%s\n", rnge) 2687 os.Exit(1) 2688 } 2689 2690 typL, strL, intL, typR, strR, intR := parseRange(item, rnge) 2691 2692 // check for special character at beginning of name 2693 if len(item) > 1 { 2694 switch item[0] { 2695 case '&': 2696 if isAllCapsOrDigits(item[1:]) { 2697 status = VARIABLE 2698 item = item[1:] 2699 } else { 2700 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized variable '%s'\n", item) 2701 os.Exit(1) 2702 } 2703 case '#': 2704 status = COUNT 2705 item = item[1:] 2706 case '%': 2707 status = LENGTH 2708 item = item[1:] 2709 case '^': 2710 status = DEPTH 2711 item = item[1:] 2712 case '*': 2713 for _, ch := range item { 2714 if ch != '*' { 2715 break 2716 } 2717 } 2718 status = STAR 2719 default: 2720 } 2721 } else { 2722 switch item { 2723 case "?": 2724 status = QUESTION 2725 case "*": 2726 status = STAR 2727 case "$": 2728 status = DOLLAR 2729 case "@": 2730 status = ATSIGN 2731 case "+": 2732 status = INDEX 2733 default: 2734 } 2735 } 2736 2737 // parse parent/element@attribute 
construct 2738 // colon indicates a namespace prefix in any or all of the components 2739 prnt, match := eutils.SplitInTwoRight(item, "/") 2740 match, attrib := eutils.SplitInTwoLeft(match, "@") 2741 2742 // leading colon indicates namespace prefix wildcard 2743 wildcard := false 2744 if strings.HasPrefix(prnt, ":") || strings.HasPrefix(match, ":") || strings.HasPrefix(attrib, ":") { 2745 wildcard = true 2746 } 2747 2748 // sequence coordinate adjustments 2749 switch status { 2750 case ZEROBASED, ONEBASED, UCSCBASED: 2751 seq := pttrn + ":" 2752 if attrib != "" { 2753 seq += "@" 2754 seq += attrib 2755 } else if match != "" { 2756 seq += match 2757 } 2758 // confirm -0-based or -1-based arguments are known sequence position elements or attributes 2759 slock.RLock() 2760 seqtype, ok := sequenceTypeIs[seq] 2761 slock.RUnlock() 2762 if !ok { 2763 fmt.Fprintf(os.Stderr, "\nERROR: Element '%s' is not suitable for sequence coordinate conversion\n", item) 2764 os.Exit(1) 2765 } 2766 switch status { 2767 case ZEROBASED: 2768 status = ELEMENT 2769 // if 1-based coordinates, decrement to get 0-based value 2770 if seqtype.Based == 1 { 2771 status = DEC 2772 } 2773 case ONEBASED: 2774 status = ELEMENT 2775 // if 0-based coordinates, increment to get 1-based value 2776 if seqtype.Based == 0 { 2777 status = INC 2778 } 2779 case UCSCBASED: 2780 status = ELEMENT 2781 // half-open intervals, start is 0-based, stop is 1-based 2782 if seqtype.Based == 0 && seqtype.Which == ISSTOP { 2783 status = INC 2784 } else if seqtype.Based == 1 && seqtype.Which == ISSTART { 2785 status = DEC 2786 } 2787 default: 2788 status = ELEMENT 2789 } 2790 default: 2791 } 2792 2793 norm := true 2794 if rnge != "" { 2795 if typL != NORANGE || typR != NORANGE || strL != "" || strR != "" || intL != 0 || intR != 0 { 2796 norm = false 2797 } 2798 } 2799 2800 tsk := &Step{Type: status, Value: item, Parent: prnt, Match: match, Attrib: attrib, 2801 TypL: typL, StrL: strL, IntL: intL, TypR: typR, StrR: strR, IntR: 
intR, 2802 Norm: norm, Wild: wildcard} 2803 2804 op.Stages = append(op.Stages, tsk) 2805 } 2806 } 2807 2808 idx := 0 2809 2810 status := UNSET 2811 2812 // parse command strings into operation structure 2813 for idx < max { 2814 str := arguments[idx] 2815 idx++ 2816 2817 if argTypeIs[str] == CONDITIONAL { 2818 fmt.Fprintf(os.Stderr, "\nERROR: Misplaced %s command\n", str) 2819 os.Exit(1) 2820 } 2821 2822 switch status { 2823 case UNSET: 2824 status = nextStatus(str) 2825 case ELEMENT, FIRST, LAST, ENCODE, DECODE, PLAIN, UPPER, LOWER, CHAIN, TITLE, ORDER, YEAR, DOI, TRANSLATE, REPLACE, 2826 TERMS, WORDS, PAIRS, REVERSE, LETTERS, CLAUSES, INDICES, MESHCODE, MATRIX, HISTOGRAM, ACCENTED, 2827 NUM, LEN, SUM, MIN, MAX, INC, DEC, SUB, AVG, DEV, MED, MUL, DIV, MOD, BIN, BIT, ZEROBASED, ONEBASED, UCSCBASED, 2828 REVCOMP, NUCLEIC, FASTA, NCBI2NA, NCBI4NA, MOLWT, HGVS: 2829 for !strings.HasPrefix(str, "-") { 2830 // create one operation per argument, even if under a single -element statement 2831 op := &Operation{Type: status, Value: str} 2832 comm = append(comm, op) 2833 parseSteps(op, pttrn) 2834 if idx >= max { 2835 break 2836 } 2837 str = arguments[idx] 2838 idx++ 2839 } 2840 status = UNSET 2841 if idx < max { 2842 status = nextStatus(str) 2843 } 2844 case TAB, RET, PFX, SFX, SEP, LBL, PFC, DEQ, PLG, ELG, WRP, ENC, DEF, REG, EXP, COLOR: 2845 op := &Operation{Type: status, Value: eutils.ConvertSlash(str)} 2846 comm = append(comm, op) 2847 status = UNSET 2848 case FWD: 2849 cmds.Foreword = eutils.ConvertSlash(str) 2850 status = UNSET 2851 case AWD: 2852 cmds.Afterword = eutils.ConvertSlash(str) 2853 status = UNSET 2854 case PKG: 2855 pkg := eutils.ConvertSlash(str) 2856 cmds.Foreword = "" 2857 cmds.Afterword = "" 2858 if pkg != "" && pkg != "-" { 2859 items := strings.Split(pkg, "/") 2860 for i := 0; i < len(items); i++ { 2861 cmds.Foreword += "<" + items[i] + ">" 2862 } 2863 for i := len(items) - 1; i >= 0; i-- { 2864 cmds.Afterword += "</" + items[i] + ">" 2865 } 2866 } 
2867 status = UNSET 2868 case VARIABLE: 2869 op := &Operation{Type: status, Value: str[1:]} 2870 comm = append(comm, op) 2871 status = VALUE 2872 case ACCUMULATOR: 2873 op := &Operation{Type: status, Value: str[2:]} 2874 comm = append(comm, op) 2875 status = VALUE 2876 case VALUE: 2877 op := &Operation{Type: status, Value: str} 2878 comm = append(comm, op) 2879 parseSteps(op, pttrn) 2880 status = UNSET 2881 case UNRECOGNIZED: 2882 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized argument '%s'\n", str) 2883 os.Exit(1) 2884 default: 2885 } 2886 } 2887 2888 return comm 2889 } 2890 2891 // parseOperations recursive definition 2892 var parseOperations func(parent *Block) 2893 2894 // parseOperations converts parsed arguments to operations lists 2895 parseOperations = func(parent *Block) { 2896 2897 args := parent.Parsed 2898 2899 partition := 0 2900 for cur, str := range args { 2901 2902 // record junction between conditional and extraction commands 2903 partition = cur + 1 2904 2905 // skip if not a command 2906 if len(str) < 1 || str[0] != '-' { 2907 continue 2908 } 2909 2910 if argTypeIs[str] != CONDITIONAL { 2911 partition = cur 2912 break 2913 } 2914 } 2915 2916 // split arguments into conditional tests and extraction or customization commands 2917 conditionals := args[0:partition] 2918 args = args[partition:] 2919 2920 partition = 0 2921 foundElse := false 2922 for cur, str := range args { 2923 2924 // record junction at -else command 2925 partition = cur + 1 2926 2927 // skip if not a command 2928 if len(str) < 1 || str[0] != '-' { 2929 continue 2930 } 2931 2932 if str == "-else" { 2933 partition = cur 2934 foundElse = true 2935 break 2936 } 2937 } 2938 2939 extractions := args[0:partition] 2940 alternative := args[partition:] 2941 2942 if len(alternative) > 0 && alternative[0] == "-else" { 2943 alternative = alternative[1:] 2944 } 2945 2946 // validate argument structure and convert to operations lists 2947 parent.Conditions = parseConditionals(parent, 
conditionals) 2948 parent.Commands = parseExtractions(parent, extractions) 2949 parent.Failure = parseExtractions(parent, alternative) 2950 2951 // reality checks on placement of -else command 2952 if foundElse { 2953 if len(conditionals) < 1 { 2954 fmt.Fprintf(os.Stderr, "\nERROR: Misplaced -else command\n") 2955 os.Exit(1) 2956 } 2957 if len(alternative) < 1 { 2958 fmt.Fprintf(os.Stderr, "\nERROR: Misplaced -else command\n") 2959 os.Exit(1) 2960 } 2961 if len(parent.Subtasks) > 0 { 2962 fmt.Fprintf(os.Stderr, "\nERROR: Misplaced -else command\n") 2963 os.Exit(1) 2964 } 2965 } 2966 2967 for _, sub := range parent.Subtasks { 2968 parseOperations(sub) 2969 } 2970 } 2971 2972 // parseArguments 2973 2974 head := &Block{} 2975 2976 for _, txt := range cmdargs { 2977 head.Working = append(head.Working, txt) 2978 } 2979 2980 // initial parsing of exploration command structure 2981 parseCommands(head, PATTERN) 2982 2983 if len(head.Subtasks) != 1 { 2984 return nil 2985 } 2986 2987 // skip past empty placeholder 2988 head = head.Subtasks[0] 2989 2990 // convert command strings to array of operations for faster processing 2991 parseOperations(head) 2992 2993 // check for no -element or multiple -pattern commands 2994 noElement := true 2995 numPatterns := 0 2996 for _, txt := range cmdargs { 2997 if argTypeIs[txt] == EXTRACTION { 2998 noElement = false 2999 } 3000 if txt == "-pattern" || txt == "-Pattern" { 3001 numPatterns++ 3002 } else if txt == "-select" { 3003 noElement = false 3004 head.Position = "select" 3005 } 3006 } 3007 3008 if numPatterns < 1 { 3009 fmt.Fprintf(os.Stderr, "\nERROR: No -pattern in command-line arguments\n") 3010 os.Exit(1) 3011 } 3012 3013 if numPatterns > 1 { 3014 fmt.Fprintf(os.Stderr, "\nERROR: Only one -pattern command is permitted\n") 3015 os.Exit(1) 3016 } 3017 3018 if noElement { 3019 fmt.Fprintf(os.Stderr, "\nERROR: No -element statement in argument list\n") 3020 os.Exit(1) 3021 } 3022 3023 return head 3024} 3025 3026// printSubtree 
supports compression styles selected by -element "*" through "****" 3027func printSubtree(node *eutils.XMLNode, style IndentType, printAttrs bool, proc func(string)) { 3028 3029 if node == nil || proc == nil { 3030 return 3031 } 3032 3033 // WRAPPED is SUBTREE plus each attribute on its own line 3034 wrapped := false 3035 if style == WRAPPED { 3036 style = SUBTREE 3037 wrapped = true 3038 } 3039 3040 // INDENT is offset by two spaces to allow for parent tag, SUBTREE is not offset 3041 initial := 1 3042 if style == SUBTREE { 3043 style = INDENT 3044 initial = 0 3045 } 3046 3047 // array to speed up indentation 3048 indentSpaces := []string{ 3049 "", 3050 " ", 3051 " ", 3052 " ", 3053 " ", 3054 " ", 3055 " ", 3056 " ", 3057 " ", 3058 " ", 3059 } 3060 3061 // indent a specified number of spaces 3062 doIndent := func(indt int) { 3063 i := indt 3064 for i > 9 { 3065 proc(" ") 3066 i -= 10 3067 } 3068 if i < 0 { 3069 return 3070 } 3071 proc(indentSpaces[i]) 3072 } 3073 3074 // doSubtree recursive definition 3075 var doSubtree func(*eutils.XMLNode, int) 3076 3077 doSubtree = func(curr *eutils.XMLNode, depth int) { 3078 3079 // suppress if it would be an empty self-closing tag 3080 if !eutils.IsNotJustWhitespace(curr.Attributes) && curr.Contents == "" && curr.Children == nil { 3081 return 3082 } 3083 3084 if style == INDENT { 3085 doIndent(depth) 3086 } 3087 3088 if curr.Name != "" { 3089 proc("<") 3090 proc(curr.Name) 3091 3092 if printAttrs { 3093 3094 attr := strings.TrimSpace(curr.Attributes) 3095 attr = eutils.CompressRunsOfSpaces(attr) 3096 3097 if attr != "" { 3098 3099 if wrapped { 3100 3101 start := 0 3102 idx := 0 3103 3104 attlen := len(attr) 3105 3106 for idx < attlen { 3107 ch := attr[idx] 3108 if ch == '=' { 3109 str := attr[start:idx] 3110 proc("\n") 3111 doIndent(depth) 3112 proc(" ") 3113 proc(str) 3114 // skip past equal sign and leading double quote 3115 idx += 2 3116 start = idx 3117 } else if ch == '"' || ch == '\'' { 3118 str := attr[start:idx] 3119 
proc("=\"") 3120 proc(str) 3121 proc("\"") 3122 // skip past trailing double quote and (possible) space 3123 idx += 2 3124 start = idx 3125 } else { 3126 idx++ 3127 } 3128 } 3129 3130 proc("\n") 3131 doIndent(depth) 3132 3133 } else { 3134 3135 proc(" ") 3136 proc(attr) 3137 } 3138 } 3139 } 3140 3141 // see if suitable for for self-closing tag 3142 if curr.Contents == "" && curr.Children == nil { 3143 proc("/>") 3144 if style != COMPACT { 3145 proc("\n") 3146 } 3147 return 3148 } 3149 3150 proc(">") 3151 } 3152 3153 if curr.Contents != "" { 3154 3155 proc(curr.Contents[:]) 3156 3157 } else { 3158 3159 if style != COMPACT { 3160 proc("\n") 3161 } 3162 3163 for chld := curr.Children; chld != nil; chld = chld.Next { 3164 doSubtree(chld, depth+1) 3165 } 3166 3167 if style == INDENT { 3168 i := depth 3169 for i > 9 { 3170 proc(" ") 3171 i -= 10 3172 } 3173 proc(indentSpaces[i]) 3174 } 3175 } 3176 3177 if curr.Name != "" { 3178 proc("<") 3179 proc("/") 3180 proc(curr.Name) 3181 proc(">") 3182 } 3183 3184 if style != COMPACT { 3185 proc("\n") 3186 } 3187 } 3188 3189 doSubtree(node, initial) 3190} 3191 3192var ( 3193 xlock sync.Mutex 3194 replx map[string]*regexp.Regexp 3195) 3196 3197// processClause handles comma-separated -element arguments 3198func processClause(curr *eutils.XMLNode, stages []*Step, mask, prev, pfx, sfx, plg, sep, def, reg, exp string, wrp bool, status OpType, index, level int, variables map[string]string, transform map[string]string, histogram map[string]int) (string, bool) { 3199 3200 if curr == nil || stages == nil { 3201 return "", false 3202 } 3203 3204 if replx == nil { 3205 xlock.Lock() 3206 if replx == nil { 3207 replx = make(map[string]*regexp.Regexp) 3208 } 3209 xlock.Unlock() 3210 } 3211 3212 // processElement handles individual -element constructs 3213 processElement := func(acc func(string)) { 3214 3215 if acc == nil { 3216 return 3217 } 3218 3219 // element names combined with commas are treated as a prefix-separator-suffix group 3220 for 
_, stage := range stages { 3221 3222 stat := stage.Type 3223 item := stage.Value 3224 prnt := stage.Parent 3225 match := stage.Match 3226 attrib := stage.Attrib 3227 typL := stage.TypL 3228 strL := stage.StrL 3229 intL := stage.IntL 3230 typR := stage.TypR 3231 strR := stage.StrR 3232 intR := stage.IntR 3233 norm := stage.Norm 3234 wildcard := stage.Wild 3235 unescape := (stat != INDICES) 3236 3237 // exploreElements is a wrapper for eutils.ExploreElements, obtaining most arguments as closures 3238 exploreElements := func(proc func(string, int)) { 3239 eutils.ExploreElements(curr, mask, prnt, match, attrib, wildcard, unescape, level, proc) 3240 } 3241 3242 // sendSlice applies optional [min:max] range restriction and sends result to accumulator 3243 sendSlice := func(str string) { 3244 3245 // handle usual situation with no range first 3246 if norm { 3247 if wrp { 3248 str = html.EscapeString(str) 3249 } 3250 acc(str) 3251 return 3252 } 3253 3254 // check for [after|before] variant 3255 if typL == STRINGRANGE || typR == STRINGRANGE { 3256 if strL != "" { 3257 // use case-insensitive test 3258 strL = strings.ToUpper(strL) 3259 idx := strings.Index(strings.ToUpper(str), strL) 3260 if idx < 0 { 3261 // specified substring must be present in original string 3262 return 3263 } 3264 ln := len(strL) 3265 // remove leading text 3266 str = str[idx+ln:] 3267 } 3268 if strR != "" { 3269 strR = strings.ToUpper(strR) 3270 idx := strings.Index(strings.ToUpper(str), strR) 3271 if idx < 0 { 3272 // specified substring must be present in remaining string 3273 return 3274 } 3275 // remove trailing text 3276 str = str[:idx] 3277 } 3278 if str != "" { 3279 if wrp { 3280 str = html.EscapeString(str) 3281 } 3282 acc(str) 3283 } 3284 return 3285 } 3286 3287 min := 0 3288 max := 0 3289 3290 // slice arguments use variable value +- adjustment or integer constant 3291 if typL == VARIABLERANGE { 3292 if strL == "" { 3293 return 3294 } 3295 lft, ok := variables[strL] 3296 if !ok { 3297 return 
3298 } 3299 val, err := strconv.Atoi(lft) 3300 if err != nil { 3301 return 3302 } 3303 // range argument values are inclusive and 1-based, decrement variable start +- offset to use in slice 3304 min = val + intL - 1 3305 } else if typL == INTEGERRANGE { 3306 // range argument values are inclusive and 1-based, decrement literal start to use in slice 3307 min = intL - 1 3308 } 3309 if typR == VARIABLERANGE { 3310 if strR == "" { 3311 return 3312 } 3313 rgt, ok := variables[strR] 3314 if !ok { 3315 return 3316 } 3317 val, err := strconv.Atoi(rgt) 3318 if err != nil { 3319 return 3320 } 3321 if val+intR < 0 { 3322 // negative value is 1-based inset from end of string (undocumented) 3323 max = len(str) + val + intR + 1 3324 } else { 3325 max = val + intR 3326 } 3327 } else if typR == INTEGERRANGE { 3328 if intR < 0 { 3329 // negative max is inset from end of string (undocumented) 3330 max = len(str) + intR + 1 3331 } else { 3332 max = intR 3333 } 3334 } 3335 3336 doRevComp := false 3337 doUpCase := false 3338 if status == NUCLEIC { 3339 // -nucleic uses direction of range to decide between forward strand or reverse complement 3340 if min+1 > max { 3341 min, max = max-1, min+1 3342 doRevComp = true 3343 } 3344 doUpCase = true 3345 } 3346 3347 // numeric range now calculated, apply slice to string 3348 if min == 0 && max == 0 { 3349 if doRevComp { 3350 str = eutils.ReverseComplement(str) 3351 } 3352 if doUpCase { 3353 str = strings.ToUpper(str) 3354 } 3355 if wrp { 3356 str = html.EscapeString(str) 3357 } 3358 acc(str) 3359 } else if max == 0 { 3360 if min > 0 && min < len(str) { 3361 str = str[min:] 3362 if str != "" { 3363 if doRevComp { 3364 str = eutils.ReverseComplement(str) 3365 } 3366 if doUpCase { 3367 str = strings.ToUpper(str) 3368 } 3369 if wrp { 3370 str = html.EscapeString(str) 3371 } 3372 acc(str) 3373 } 3374 } 3375 } else if min == 0 { 3376 if max > 0 && max <= len(str) { 3377 str = str[:max] 3378 if str != "" { 3379 if doRevComp { 3380 str = 
eutils.ReverseComplement(str) 3381 } 3382 if doUpCase { 3383 str = strings.ToUpper(str) 3384 } 3385 if wrp { 3386 str = html.EscapeString(str) 3387 } 3388 acc(str) 3389 } 3390 } 3391 } else { 3392 if min < max && min > 0 && max <= len(str) { 3393 str = str[min:max] 3394 if str != "" { 3395 if doRevComp { 3396 str = eutils.ReverseComplement(str) 3397 } 3398 if doUpCase { 3399 str = strings.ToUpper(str) 3400 } 3401 if wrp { 3402 str = html.EscapeString(str) 3403 } 3404 acc(str) 3405 } 3406 } 3407 } 3408 } 3409 3410 switch stat { 3411 case ELEMENT: 3412 exploreElements(func(str string, lvl int) { 3413 if str != "" { 3414 sendSlice(str) 3415 } 3416 }) 3417 case TERMS, WORDS, PAIRS, REVERSE, LETTERS, CLAUSES, INDICES, MESHCODE, MATRIX, HISTOGRAM, ACCENTED, 3418 VALUE, LEN, SUM, MIN, MAX, SUB, AVG, DEV, MED, MUL, DIV, MOD, BIN, BIT, 3419 REVCOMP, NUCLEIC, FASTA, NCBI2NA, NCBI4NA, MOLWT, HGVS: 3420 exploreElements(func(str string, lvl int) { 3421 if str != "" { 3422 sendSlice(str) 3423 } 3424 }) 3425 case FIRST: 3426 single := "" 3427 3428 exploreElements(func(str string, lvl int) { 3429 if single == "" { 3430 single = str 3431 } 3432 }) 3433 3434 if single != "" { 3435 sendSlice(single) 3436 } 3437 case LAST: 3438 single := "" 3439 3440 exploreElements(func(str string, lvl int) { 3441 single = str 3442 }) 3443 3444 if single != "" { 3445 sendSlice(single) 3446 } 3447 case ENCODE: 3448 exploreElements(func(str string, lvl int) { 3449 if str != "" { 3450 if !wrp { 3451 str = html.EscapeString(str) 3452 } 3453 sendSlice(str) 3454 } 3455 }) 3456 case DECODE: 3457 // superseded by transmute -decode64 (undocumented) 3458 exploreElements(func(str string, lvl int) { 3459 if str != "" { 3460 txt, err := base64.StdEncoding.DecodeString(str) 3461 if err == nil { 3462 sendSlice(string(txt)) 3463 } 3464 } 3465 }) 3466 case PLAIN: 3467 exploreElements(func(str string, lvl int) { 3468 if str != "" { 3469 if eutils.IsNotASCII(str) { 3470 str = eutils.DoAccentTransform(str) 3471 if 
eutils.HasUnicodeMarkup(str) { 3472 str = eutils.RepairUnicodeMarkup(str, eutils.SPACE) 3473 } 3474 } 3475 if eutils.HasBadSpace(str) { 3476 str = eutils.CleanupBadSpaces(str) 3477 } 3478 if eutils.HasAngleBracket(str) { 3479 str = eutils.RepairTableMarkup(str, eutils.SPACE) 3480 str = eutils.RemoveEmbeddedMarkup(str) 3481 str = eutils.CompressRunsOfSpaces(str) 3482 } 3483 sendSlice(str) 3484 } 3485 }) 3486 case UPPER: 3487 exploreElements(func(str string, lvl int) { 3488 if str != "" { 3489 str = strings.ToUpper(str) 3490 sendSlice(str) 3491 } 3492 }) 3493 case LOWER: 3494 exploreElements(func(str string, lvl int) { 3495 if str != "" { 3496 str = strings.ToLower(str) 3497 sendSlice(str) 3498 } 3499 }) 3500 case CHAIN: 3501 exploreElements(func(str string, lvl int) { 3502 if str != "" { 3503 str = strings.Replace(str, " ", "_", -1) 3504 sendSlice(str) 3505 } 3506 }) 3507 case TITLE: 3508 exploreElements(func(str string, lvl int) { 3509 if str != "" { 3510 str = strings.ToLower(str) 3511 str = strings.Title(str) 3512 sendSlice(str) 3513 } 3514 }) 3515 case ORDER: 3516 exploreElements(func(str string, lvl int) { 3517 if str != "" { 3518 str = sortStringByWords(str) 3519 sendSlice(str) 3520 } 3521 }) 3522 case YEAR: 3523 exploreElements(func(str string, lvl int) { 3524 if str != "" { 3525 words := strings.FieldsFunc(str, func(c rune) bool { 3526 return !unicode.IsDigit(c) 3527 }) 3528 for _, item := range words { 3529 if len(item) == 4 { 3530 sendSlice(item) 3531 // only print first year, e.g., PubDate/MedlineDate "2008 Dec-2009 Jan" 3532 return 3533 } 3534 } 3535 } 3536 }) 3537 case DOI: 3538 exploreElements(func(str string, lvl int) { 3539 if str != "" { 3540 str = strings.TrimPrefix(str, "doi:") 3541 str = strings.TrimSpace(str) 3542 str = strings.TrimPrefix(str, "/") 3543 str = strings.TrimPrefix(str, "https://doi.org/") 3544 str = strings.TrimPrefix(str, "http://dx.doi.org/") 3545 str := url.QueryEscape(str) 3546 sendSlice("https://doi.org/" + str) 3547 } 3548 }) 
3549 case TRANSLATE: 3550 exploreElements(func(str string, lvl int) { 3551 if str != "" { 3552 txt, found := transform[str] 3553 if found { 3554 // require successful mapping 3555 sendSlice(txt) 3556 } 3557 } 3558 }) 3559 case REPLACE: 3560 exploreElements(func(str string, lvl int) { 3561 if str != "" && replx != nil { 3562 re, found := replx[str] 3563 if !found { 3564 xlock.Lock() 3565 re, found = replx[str] 3566 if !found { 3567 nw, err := regexp.Compile(reg) 3568 if err == nil { 3569 replx[str] = nw 3570 re = nw 3571 } 3572 } 3573 xlock.Unlock() 3574 } 3575 if re != nil { 3576 txt := re.ReplaceAllString(str, exp) 3577 if txt != "" { 3578 sendSlice(txt) 3579 } 3580 } 3581 } 3582 }) 3583 case VARIABLE, ACCUMULATOR: 3584 // use value of stored variable 3585 val, ok := variables[match] 3586 if ok { 3587 sendSlice(val) 3588 } 3589 case NUM, COUNT: 3590 count := 0 3591 3592 exploreElements(func(str string, lvl int) { 3593 count++ 3594 }) 3595 3596 // number of element objects 3597 val := strconv.Itoa(count) 3598 acc(val) 3599 case LENGTH: 3600 length := 0 3601 3602 exploreElements(func(str string, lvl int) { 3603 length += len(str) 3604 }) 3605 3606 // length of element strings 3607 val := strconv.Itoa(length) 3608 acc(val) 3609 case DEPTH: 3610 exploreElements(func(str string, lvl int) { 3611 // depth of each element in scope 3612 val := strconv.Itoa(lvl) 3613 acc(val) 3614 }) 3615 case INDEX: 3616 // -element "+" prints index of current XML object 3617 val := strconv.Itoa(index) 3618 acc(val) 3619 case INC: 3620 // -inc, or component of -0-based, -1-based, or -ucsc-based 3621 exploreElements(func(str string, lvl int) { 3622 if str != "" { 3623 num, err := strconv.Atoi(str) 3624 if err == nil { 3625 // increment value 3626 num++ 3627 val := strconv.Itoa(num) 3628 acc(val) 3629 } 3630 } 3631 }) 3632 case DEC: 3633 // -dec, or component of -0-based, -1-based, or -ucsc-based 3634 exploreElements(func(str string, lvl int) { 3635 if str != "" { 3636 num, err := 
strconv.Atoi(str) 3637 if err == nil { 3638 // decrement value 3639 num-- 3640 val := strconv.Itoa(num) 3641 acc(val) 3642 } 3643 } 3644 }) 3645 case QUESTION: 3646 acc(curr.Name) 3647 case STAR: 3648 // -element "*" prints current XML subtree on a single line 3649 style := SINGULARITY 3650 printAttrs := true 3651 3652 for _, ch := range item { 3653 if ch == '*' { 3654 style++ 3655 } else if ch == '@' { 3656 printAttrs = false 3657 } 3658 } 3659 if style > WRAPPED { 3660 style = WRAPPED 3661 } 3662 if style < COMPACT { 3663 style = COMPACT 3664 } 3665 3666 var buffer strings.Builder 3667 3668 printSubtree(curr, style, printAttrs, 3669 func(str string) { 3670 if str != "" { 3671 buffer.WriteString(str) 3672 } 3673 }) 3674 3675 txt := buffer.String() 3676 if txt != "" { 3677 acc(txt) 3678 } 3679 case DOLLAR: 3680 for chld := curr.Children; chld != nil; chld = chld.Next { 3681 acc(chld.Name) 3682 } 3683 case ATSIGN: 3684 if curr.Attributes != "" && curr.Attribs == nil { 3685 curr.Attribs = eutils.ParseAttributes(curr.Attributes) 3686 } 3687 for i := 0; i < len(curr.Attribs)-1; i += 2 { 3688 acc(curr.Attribs[i]) 3689 } 3690 default: 3691 } 3692 } 3693 } 3694 3695 ok := false 3696 3697 // format results in buffer 3698 var buffer strings.Builder 3699 3700 buffer.WriteString(prev) 3701 buffer.WriteString(plg) 3702 buffer.WriteString(pfx) 3703 between := "" 3704 3705 switch status { 3706 case ELEMENT: 3707 processElement(func(str string) { 3708 if str != "" { 3709 ok = true 3710 buffer.WriteString(between) 3711 buffer.WriteString(str) 3712 between = sep 3713 } 3714 }) 3715 case FIRST: 3716 single := "" 3717 3718 processElement(func(str string) { 3719 ok = true 3720 if single == "" { 3721 single = str 3722 } 3723 }) 3724 3725 if single != "" { 3726 buffer.WriteString(between) 3727 buffer.WriteString(single) 3728 between = sep 3729 } 3730 case LAST: 3731 single := "" 3732 3733 processElement(func(str string) { 3734 ok = true 3735 single = str 3736 }) 3737 3738 if single != 
"" { 3739 buffer.WriteString(between) 3740 buffer.WriteString(single) 3741 between = sep 3742 } 3743 case ENCODE, DECODE, PLAIN, UPPER, LOWER, CHAIN, TITLE, ORDER, YEAR, DOI, TRANSLATE, 3744 REPLACE, VALUE, NUM, INC, DEC, ZEROBASED, ONEBASED, UCSCBASED, NUCLEIC: 3745 processElement(func(str string) { 3746 if str != "" { 3747 ok = true 3748 buffer.WriteString(between) 3749 buffer.WriteString(str) 3750 between = sep 3751 } 3752 }) 3753 case LEN: 3754 length := 0 3755 3756 processElement(func(str string) { 3757 length += len(str) 3758 ok = true 3759 }) 3760 3761 if ok { 3762 // length of element strings 3763 val := strconv.Itoa(length) 3764 buffer.WriteString(between) 3765 buffer.WriteString(val) 3766 between = sep 3767 } 3768 case SUM: 3769 sum := 0 3770 3771 processElement(func(str string) { 3772 value, err := strconv.Atoi(str) 3773 if err == nil { 3774 sum += value 3775 ok = true 3776 } 3777 }) 3778 3779 if ok { 3780 // sum of element values 3781 val := strconv.Itoa(sum) 3782 buffer.WriteString(between) 3783 buffer.WriteString(val) 3784 between = sep 3785 } 3786 case MIN: 3787 min := 0 3788 3789 processElement(func(str string) { 3790 value, err := strconv.Atoi(str) 3791 if err == nil { 3792 if !ok || value < min { 3793 min = value 3794 } 3795 ok = true 3796 } 3797 }) 3798 3799 if ok { 3800 // minimum of element values 3801 val := strconv.Itoa(min) 3802 buffer.WriteString(between) 3803 buffer.WriteString(val) 3804 between = sep 3805 } 3806 case MAX: 3807 max := 0 3808 3809 processElement(func(str string) { 3810 value, err := strconv.Atoi(str) 3811 if err == nil { 3812 if !ok || value > max { 3813 max = value 3814 } 3815 ok = true 3816 } 3817 }) 3818 3819 if ok { 3820 // maximum of element values 3821 val := strconv.Itoa(max) 3822 buffer.WriteString(between) 3823 buffer.WriteString(val) 3824 between = sep 3825 } 3826 case SUB: 3827 first := 0 3828 second := 0 3829 count := 0 3830 3831 processElement(func(str string) { 3832 value, err := strconv.Atoi(str) 3833 if err 
== nil { 3834 count++ 3835 if count == 1 { 3836 first = value 3837 } else if count == 2 { 3838 second = value 3839 } 3840 } 3841 }) 3842 3843 if count == 2 { 3844 // must have exactly 2 elements 3845 ok = true 3846 // difference of element values 3847 val := strconv.Itoa(first - second) 3848 buffer.WriteString(between) 3849 buffer.WriteString(val) 3850 between = sep 3851 } 3852 case AVG: 3853 sum := 0 3854 count := 0 3855 3856 processElement(func(str string) { 3857 value, err := strconv.Atoi(str) 3858 if err == nil { 3859 sum += value 3860 count++ 3861 ok = true 3862 } 3863 }) 3864 3865 if ok { 3866 // average of element values 3867 avg := int(float64(sum) / float64(count)) 3868 val := strconv.Itoa(avg) 3869 buffer.WriteString(between) 3870 buffer.WriteString(val) 3871 between = sep 3872 } 3873 case DEV: 3874 count := 0 3875 mean := 0.0 3876 m2 := 0.0 3877 3878 processElement(func(str string) { 3879 value, err := strconv.Atoi(str) 3880 if err == nil { 3881 // Welford algorithm for one-pass standard deviation 3882 count++ 3883 x := float64(value) 3884 delta := x - mean 3885 mean += delta / float64(count) 3886 m2 += delta * (x - mean) 3887 } 3888 }) 3889 3890 if count > 1 { 3891 // must have at least 2 elements 3892 ok = true 3893 // standard deviation of element values 3894 vrc := m2 / float64(count-1) 3895 dev := int(math.Sqrt(vrc)) 3896 val := strconv.Itoa(dev) 3897 buffer.WriteString(between) 3898 buffer.WriteString(val) 3899 between = sep 3900 } 3901 case MED: 3902 var arry []int 3903 count := 0 3904 3905 processElement(func(str string) { 3906 value, err := strconv.Atoi(str) 3907 if err == nil { 3908 arry = append(arry, value) 3909 count++ 3910 ok = true 3911 } 3912 }) 3913 3914 if ok { 3915 // median of element values 3916 sort.Slice(arry, func(i, j int) bool { return arry[i] < arry[j] }) 3917 med := arry[count/2] 3918 val := strconv.Itoa(med) 3919 buffer.WriteString(between) 3920 buffer.WriteString(val) 3921 between = sep 3922 } 3923 case MUL: 3924 first := 0 
3925 second := 0 3926 count := 0 3927 3928 processElement(func(str string) { 3929 value, err := strconv.Atoi(str) 3930 if err == nil { 3931 count++ 3932 if count == 1 { 3933 first = value 3934 } else if count == 2 { 3935 second = value 3936 } 3937 } 3938 }) 3939 3940 if count == 2 { 3941 // must have exactly 2 elements 3942 ok = true 3943 // product of element values 3944 val := strconv.Itoa(first * second) 3945 buffer.WriteString(between) 3946 buffer.WriteString(val) 3947 between = sep 3948 } 3949 case DIV: 3950 first := 0 3951 second := 0 3952 count := 0 3953 3954 processElement(func(str string) { 3955 value, err := strconv.Atoi(str) 3956 if err == nil { 3957 count++ 3958 if count == 1 { 3959 first = value 3960 } else if count == 2 { 3961 second = value 3962 } 3963 } 3964 }) 3965 3966 if count == 2 { 3967 // must have exactly 2 elements 3968 ok = true 3969 // quotient of element values 3970 val := strconv.Itoa(first / second) 3971 buffer.WriteString(between) 3972 buffer.WriteString(val) 3973 between = sep 3974 } 3975 case MOD: 3976 first := 0 3977 second := 0 3978 count := 0 3979 3980 processElement(func(str string) { 3981 value, err := strconv.Atoi(str) 3982 if err == nil { 3983 count++ 3984 if count == 1 { 3985 first = value 3986 } else if count == 2 { 3987 second = value 3988 } 3989 } 3990 }) 3991 3992 if count == 2 { 3993 // must have exactly 2 elements 3994 ok = true 3995 // modulus of element values 3996 val := strconv.Itoa(first % second) 3997 buffer.WriteString(between) 3998 buffer.WriteString(val) 3999 between = sep 4000 } 4001 case BIN: 4002 processElement(func(str string) { 4003 num, err := strconv.Atoi(str) 4004 if err == nil { 4005 // convert to binary representation 4006 val := strconv.FormatInt(int64(num), 2) 4007 buffer.WriteString(between) 4008 buffer.WriteString(val) 4009 between = sep 4010 ok = true 4011 } 4012 }) 4013 case BIT: 4014 processElement(func(str string) { 4015 num, err := strconv.Atoi(str) 4016 if err == nil { 4017 // Kernighan 
algorithm for counting set bits 4018 count := 0 4019 for num != 0 { 4020 num &= num - 1 4021 count++ 4022 } 4023 val := strconv.Itoa(count) 4024 buffer.WriteString(between) 4025 buffer.WriteString(val) 4026 between = sep 4027 ok = true 4028 } 4029 }) 4030 case REVCOMP: 4031 processElement(func(str string) { 4032 if str != "" { 4033 ok = true 4034 buffer.WriteString(between) 4035 str = eutils.ReverseComplement(str) 4036 buffer.WriteString(str) 4037 between = sep 4038 } 4039 }) 4040 case FASTA: 4041 processElement(func(str string) { 4042 for str != "" { 4043 mx := len(str) 4044 if mx > 50 { 4045 mx = 50 4046 } 4047 item := str[:mx] 4048 str = str[mx:] 4049 ok = true 4050 buffer.WriteString(between) 4051 buffer.WriteString(item) 4052 between = sep 4053 } 4054 }) 4055 case NCBI2NA: 4056 processElement(func(str string) { 4057 if str != "" { 4058 src := []byte(str) 4059 dst := make([]byte, hex.DecodedLen(len(src))) 4060 n, err := hex.Decode(dst, src) 4061 if err == nil { 4062 dst = dst[:n] 4063 ok = true 4064 buffer.WriteString(between) 4065 for _, byt := range dst { 4066 tmp := ncbi2naToIupac[int(byt)] 4067 buffer.WriteString(tmp) 4068 } 4069 between = sep 4070 } 4071 } 4072 }) 4073 case NCBI4NA: 4074 processElement(func(str string) { 4075 if str != "" { 4076 src := []byte(str) 4077 dst := make([]byte, hex.DecodedLen(len(src))) 4078 n, err := hex.Decode(dst, src) 4079 if err == nil { 4080 dst = dst[:n] 4081 ok = true 4082 buffer.WriteString(between) 4083 for _, byt := range dst { 4084 tmp := ncbi4naToIupac[int(byt)] 4085 buffer.WriteString(tmp) 4086 } 4087 between = sep 4088 } 4089 } 4090 }) 4091 case MOLWT: 4092 processElement(func(str string) { 4093 if str != "" { 4094 ok = true 4095 buffer.WriteString(between) 4096 str = eutils.ProteinWeight(str, true) 4097 buffer.WriteString(str) 4098 between = sep 4099 } 4100 }) 4101 case HGVS: 4102 processElement(func(str string) { 4103 if str != "" { 4104 ok = true 4105 buffer.WriteString(between) 4106 str = eutils.ParseHGVS(str) 
4107 buffer.WriteString(str) 4108 between = sep 4109 } 4110 }) 4111 case INDICES: 4112 norm := make(map[string][]string) 4113 stem := make(map[string][]string) 4114 4115 cumulative := 0 4116 4117 // mutex for inverted index 4118 var ilock sync.Mutex 4119 4120 addItem := func(field map[string][]string, term string, position int) { 4121 4122 // protect with mutex 4123 ilock.Lock() 4124 4125 arry, found := field[term] 4126 if !found { 4127 arry = make([]string, 0, 1) 4128 } 4129 arry = append(arry, strconv.Itoa(position)) 4130 field[term] = arry 4131 4132 ilock.Unlock() 4133 } 4134 4135 processElement(func(str string) { 4136 4137 if str == "" { 4138 return 4139 } 4140 4141 if str == "[Not Available]." { 4142 return 4143 } 4144 4145 if eutils.IsNotASCII(str) { 4146 str = eutils.DoAccentTransform(str) 4147 if eutils.HasUnicodeMarkup(str) { 4148 str = eutils.RepairUnicodeMarkup(str, eutils.SPACE) 4149 } 4150 } 4151 4152 str = strings.ToLower(str) 4153 4154 if eutils.HasBadSpace(str) { 4155 str = eutils.CleanupBadSpaces(str) 4156 } 4157 if eutils.HasAngleBracket(str) { 4158 str = eutils.RepairEncodedMarkup(str) 4159 str = eutils.RepairTableMarkup(str, eutils.SPACE) 4160 str = eutils.RepairScriptMarkup(str, eutils.SPACE) 4161 str = eutils.RepairMathMLMarkup(str, eutils.SPACE) 4162 // RemoveEmbeddedMarkup must be called before UnescapeString, which was suppressed in eutils.ExploreElements 4163 str = eutils.RemoveEmbeddedMarkup(str) 4164 } 4165 4166 if eutils.HasAmpOrNotASCII(str) { 4167 str = html.UnescapeString(str) 4168 str = strings.ToLower(str) 4169 } 4170 4171 if eutils.IsNotASCII(str) { 4172 if eutils.HasGreek(str) { 4173 str = eutils.SpellGreek(str) 4174 str = eutils.CompressRunsOfSpaces(str) 4175 } 4176 } 4177 4178 str = strings.Replace(str, "(", " ", -1) 4179 str = strings.Replace(str, ")", " ", -1) 4180 4181 str = strings.Replace(str, "_", " ", -1) 4182 4183 if eutils.HasHyphenOrApostrophe(str) { 4184 str = eutils.FixSpecialCases(str) 4185 } 4186 4187 str = 
strings.Replace(str, "-", " ", -1) 4188 4189 // remove trailing punctuation from each word 4190 var arry []string 4191 4192 terms := strings.Fields(str) 4193 for _, item := range terms { 4194 max := len(item) 4195 for max > 1 { 4196 ch := item[max-1] 4197 if ch != '.' && ch != ',' && ch != ':' && ch != ';' { 4198 break 4199 } 4200 // trim trailing period, comma, colon, and semicolon 4201 item = item[:max-1] 4202 // continue checking for runs of punctuation at end 4203 max-- 4204 } 4205 if item == "" { 4206 continue 4207 } 4208 arry = append(arry, item) 4209 } 4210 4211 // rejoin into string 4212 cleaned := strings.Join(arry, " ") 4213 4214 // break clauses at punctuation other than space or underscore, and at non-ASCII characters 4215 clauses := strings.FieldsFunc(cleaned, func(c rune) bool { 4216 return (!unicode.IsLetter(c) && !unicode.IsDigit(c)) && c != ' ' && c != '_' || c > 127 4217 }) 4218 4219 // space replaces plus sign to separate runs of unpunctuated words 4220 phrases := strings.Join(clauses, " ") 4221 4222 // break phrases into individual words 4223 words := strings.Fields(phrases) 4224 4225 for _, item := range words { 4226 4227 cumulative++ 4228 4229 // skip at site of punctuation break 4230 if item == "+" { 4231 continue 4232 } 4233 4234 // skip terms that are all digits 4235 if eutils.IsAllDigitsOrPeriod(item) { 4236 continue 4237 } 4238 4239 // optional stop word removal 4240 if deStop && eutils.IsStopWord(item) { 4241 continue 4242 } 4243 4244 // index single normalized term 4245 addItem(norm, item, cumulative) 4246 ok = true 4247 4248 // apply stemming algorithm 4249 item = porter2.Stem(item) 4250 item = strings.TrimSpace(item) 4251 addItem(stem, item, cumulative) 4252 } 4253 4254 // pad to avoid false positive proximity match of words in adjacent paragraphs 4255 rounded := ((cumulative + 99) / 100) * 100 4256 if rounded-cumulative < 20 { 4257 rounded += 100 4258 } 4259 cumulative = rounded 4260 }) 4261 4262 prepareIndices := func(field 
map[string][]string, label string) { 4263 4264 if len(field) < 1 { 4265 return 4266 } 4267 4268 var arry []string 4269 4270 for item := range field { 4271 arry = append(arry, item) 4272 } 4273 4274 sort.Slice(arry, func(i, j int) bool { return arry[i] < arry[j] }) 4275 4276 last := "" 4277 for _, item := range arry { 4278 item = strings.TrimSpace(item) 4279 if item == "" { 4280 continue 4281 } 4282 if item == last { 4283 // skip duplicate entry 4284 continue 4285 } 4286 buffer.WriteString(" <") 4287 buffer.WriteString(label) 4288 if len(field[item]) > 0 { 4289 buffer.WriteString(" pos=\"") 4290 attr := strings.Join(field[item], ",") 4291 buffer.WriteString(attr) 4292 buffer.WriteString("\"") 4293 } 4294 buffer.WriteString(">") 4295 buffer.WriteString(item) 4296 buffer.WriteString("</") 4297 buffer.WriteString(label) 4298 buffer.WriteString(">\n") 4299 last = item 4300 } 4301 } 4302 4303 if ok { 4304 prepareIndices(norm, "NORM") 4305 prepareIndices(stem, "STEM") 4306 } 4307 case TERMS: 4308 processElement(func(str string) { 4309 if str != "" { 4310 4311 terms := strings.Fields(str) 4312 for _, item := range terms { 4313 max := len(item) 4314 for max > 1 { 4315 ch := item[max-1] 4316 if ch != '.' 
&& ch != ',' && ch != ':' && ch != ';' { 4317 break 4318 } 4319 // trim trailing period, comma, colon, and semicolon 4320 item = item[:max-1] 4321 // continue checking for runs of punctuation at end 4322 max-- 4323 } 4324 if item == "" { 4325 continue 4326 } 4327 ok = true 4328 buffer.WriteString(between) 4329 buffer.WriteString(item) 4330 between = sep 4331 } 4332 } 4333 }) 4334 case WORDS: 4335 processElement(func(str string) { 4336 if str != "" { 4337 4338 words := strings.FieldsFunc(str, func(c rune) bool { 4339 return !unicode.IsLetter(c) && !unicode.IsDigit(c) 4340 }) 4341 for _, item := range words { 4342 item = strings.ToLower(item) 4343 if deStop { 4344 if eutils.IsStopWord(item) { 4345 continue 4346 } 4347 } 4348 if doStem { 4349 item = porter2.Stem(item) 4350 item = strings.TrimSpace(item) 4351 } 4352 if item == "" { 4353 continue 4354 } 4355 ok = true 4356 buffer.WriteString(between) 4357 buffer.WriteString(item) 4358 between = sep 4359 } 4360 } 4361 }) 4362 case PAIRS: 4363 processElement(func(str string) { 4364 if str != "" { 4365 4366 // break clauses at punctuation other than space or underscore, and at non-ASCII characters 4367 clauses := strings.FieldsFunc(str, func(c rune) bool { 4368 return (!unicode.IsLetter(c) && !unicode.IsDigit(c)) && c != ' ' || c > 127 4369 }) 4370 4371 // plus sign separates runs of unpunctuated words 4372 phrases := strings.Join(clauses, " + ") 4373 4374 // break phrases into individual words 4375 words := strings.FieldsFunc(phrases, func(c rune) bool { 4376 return !unicode.IsLetter(c) && !unicode.IsDigit(c) 4377 }) 4378 4379 if len(words) > 1 { 4380 past := "" 4381 for _, item := range words { 4382 if item == "+" { 4383 past = "" 4384 continue 4385 } 4386 item = strings.ToLower(item) 4387 if deStop { 4388 if eutils.IsStopWord(item) { 4389 past = "" 4390 continue 4391 } 4392 } 4393 if doStem { 4394 item = porter2.Stem(item) 4395 item = strings.TrimSpace(item) 4396 } 4397 if item == "" { 4398 past = "" 4399 continue 4400 
} 4401 if past != "" { 4402 ok = true 4403 buffer.WriteString(between) 4404 buffer.WriteString(past + " " + item) 4405 between = sep 4406 } 4407 past = item 4408 } 4409 } 4410 } 4411 }) 4412 case REVERSE: 4413 processElement(func(str string) { 4414 if str != "" { 4415 4416 words := strings.FieldsFunc(str, func(c rune) bool { 4417 return !unicode.IsLetter(c) && !unicode.IsDigit(c) 4418 }) 4419 for lf, rt := 0, len(words)-1; lf < rt; lf, rt = lf+1, rt-1 { 4420 words[lf], words[rt] = words[rt], words[lf] 4421 } 4422 for _, item := range words { 4423 item = strings.ToLower(item) 4424 if deStop { 4425 if eutils.IsStopWord(item) { 4426 continue 4427 } 4428 } 4429 if doStem { 4430 item = porter2.Stem(item) 4431 item = strings.TrimSpace(item) 4432 } 4433 if item == "" { 4434 continue 4435 } 4436 ok = true 4437 buffer.WriteString(between) 4438 buffer.WriteString(item) 4439 between = sep 4440 } 4441 } 4442 }) 4443 case LETTERS: 4444 processElement(func(str string) { 4445 if str != "" { 4446 for _, ch := range str { 4447 ok = true 4448 buffer.WriteString(between) 4449 buffer.WriteRune(ch) 4450 between = sep 4451 } 4452 } 4453 }) 4454 case CLAUSES: 4455 processElement(func(str string) { 4456 if str != "" { 4457 4458 clauses := strings.FieldsFunc(str, func(c rune) bool { 4459 return c == '.' 
|| c == ',' || c == ';' || c == ':' 4460 }) 4461 for _, item := range clauses { 4462 item = strings.ToLower(item) 4463 item = strings.TrimSpace(item) 4464 if item == "" { 4465 continue 4466 } 4467 ok = true 4468 buffer.WriteString(between) 4469 buffer.WriteString(item) 4470 between = sep 4471 } 4472 } 4473 }) 4474 case MESHCODE: 4475 var code []string 4476 var tree []string 4477 4478 processElement(func(str string) { 4479 if str != "" { 4480 txt, found := transform[str] 4481 str = strings.ToLower(str) 4482 code = append(code, str) 4483 ok = true 4484 4485 if !found { 4486 return 4487 } 4488 txt = strings.ToLower(txt) 4489 txt = strings.Replace(txt, ".", "_", -1) 4490 codes := strings.FieldsFunc(txt, func(c rune) bool { 4491 return c == ',' 4492 }) 4493 for _, item := range codes { 4494 ch := item[0] 4495 if item == "" { 4496 continue 4497 } 4498 switch ch { 4499 case 'a', 'c', 'd', 'e', 'f', 'g', 'z': 4500 tree = append(tree, item) 4501 default: 4502 } 4503 } 4504 } 4505 }) 4506 4507 if len(code) > 1 { 4508 sort.Slice(code, func(i, j int) bool { return code[i] < code[j] }) 4509 } 4510 if len(tree) > 1 { 4511 sort.Slice(tree, func(i, j int) bool { return tree[i] < tree[j] }) 4512 } 4513 4514 last := "" 4515 for _, item := range code { 4516 if item == last { 4517 // skip duplicate entry 4518 continue 4519 } 4520 buffer.WriteString(" <CODE>") 4521 buffer.WriteString(item) 4522 buffer.WriteString("</CODE>\n") 4523 last = item 4524 } 4525 4526 last = "" 4527 for _, item := range tree { 4528 if item == last { 4529 // skip duplicate entry 4530 continue 4531 } 4532 buffer.WriteString(" <TREE>") 4533 buffer.WriteString(item) 4534 buffer.WriteString("</TREE>\n") 4535 last = item 4536 } 4537 case MATRIX: 4538 var arry []string 4539 4540 processElement(func(str string) { 4541 if str != "" { 4542 txt, found := transform[str] 4543 if found { 4544 str = txt 4545 } 4546 arry = append(arry, str) 4547 ok = true 4548 } 4549 }) 4550 4551 if len(arry) > 1 { 4552 sort.Slice(arry, 
func(i, j int) bool { return arry[i] < arry[j] }) 4553 4554 for i, frst := range arry { 4555 for j, scnd := range arry { 4556 if i == j { 4557 continue 4558 } 4559 buffer.WriteString(between) 4560 buffer.WriteString(frst) 4561 buffer.WriteString("\t") 4562 buffer.WriteString(scnd) 4563 between = "\n" 4564 } 4565 } 4566 } 4567 case HISTOGRAM: 4568 processElement(func(str string) { 4569 if str != "" { 4570 ok = true 4571 4572 hlock.Lock() 4573 4574 val := histogram[str] 4575 val++ 4576 histogram[str] = val 4577 4578 hlock.Unlock() 4579 } 4580 }) 4581 case ACCENTED: 4582 processElement(func(str string) { 4583 if str != "" { 4584 found := false 4585 for _, ch := range str { 4586 if ch > 127 { 4587 found = true 4588 break 4589 } 4590 } 4591 if found { 4592 ok = true 4593 buffer.WriteString(between) 4594 buffer.WriteString(str) 4595 between = sep 4596 } 4597 } 4598 }) 4599 default: 4600 } 4601 4602 // use default value if nothing written 4603 if !ok && def != "" { 4604 ok = true 4605 buffer.WriteString(def) 4606 } 4607 4608 buffer.WriteString(sfx) 4609 4610 if !ok { 4611 return "", false 4612 } 4613 4614 txt := buffer.String() 4615 4616 return txt, true 4617} 4618 4619// processInstructions performs extraction commands on a subset of XML 4620func processInstructions(commands []*Operation, curr *eutils.XMLNode, mask, tab, ret string, index, level int, variables map[string]string, transform map[string]string, histogram map[string]int, accum func(string)) (string, string) { 4621 4622 if accum == nil { 4623 return tab, ret 4624 } 4625 4626 sep := "\t" 4627 pfx := "" 4628 sfx := "" 4629 plg := "" 4630 elg := "" 4631 lst := "" 4632 4633 def := "" 4634 4635 reg := "" 4636 exp := "" 4637 4638 col := "\t" 4639 lin := "\n" 4640 4641 varname := "" 4642 isAccum := false 4643 4644 wrp := false 4645 4646 plain := true 4647 var currColor *color.Color 4648 4649 // handles color, e.g., -color "red,bold", reset to plain by -color "-" (undocumented) 4650 printInColor := func(str string) { 
4651 if plain || currColor == nil { 4652 accum(str) 4653 } else { 4654 tx := currColor.SprintFunc() 4655 tmp := fmt.Sprintf("%s", tx(str)) 4656 accum(tmp) 4657 } 4658 } 4659 4660 // process commands 4661 for _, op := range commands { 4662 4663 str := op.Value 4664 4665 switch op.Type { 4666 case ELEMENT, FIRST, LAST, ENCODE, DECODE, PLAIN, UPPER, LOWER, CHAIN, TITLE, ORDER, YEAR, DOI, TRANSLATE, REPLACE, 4667 TERMS, WORDS, PAIRS, REVERSE, LETTERS, CLAUSES, INDICES, MESHCODE, MATRIX, ACCENTED, 4668 NUM, LEN, SUM, MIN, MAX, INC, DEC, SUB, AVG, DEV, MED, MUL, DIV, MOD, BIN, BIT, ZEROBASED, ONEBASED, UCSCBASED, 4669 REVCOMP, NUCLEIC, FASTA, NCBI2NA, NCBI4NA, MOLWT, HGVS: 4670 txt, ok := processClause(curr, op.Stages, mask, tab, pfx, sfx, plg, sep, def, reg, exp, wrp, op.Type, index, level, variables, transform, histogram) 4671 if ok { 4672 plg = "" 4673 lst = elg 4674 tab = col 4675 ret = lin 4676 if plain { 4677 accum(txt) 4678 } else { 4679 printInColor(txt) 4680 } 4681 } 4682 case HISTOGRAM: 4683 txt, ok := processClause(curr, op.Stages, mask, "", "", "", "", "", "", "", "", wrp, op.Type, index, level, variables, transform, histogram) 4684 if ok { 4685 accum(txt) 4686 } 4687 case TAB: 4688 col = str 4689 case RET: 4690 lin = str 4691 case PFX: 4692 pfx = str 4693 case SFX: 4694 sfx = str 4695 case SEP: 4696 sep = str 4697 case LBL: 4698 lbl := str 4699 accum(tab) 4700 accum(plg) 4701 accum(pfx) 4702 if plain { 4703 accum(lbl) 4704 } else { 4705 printInColor(lbl) 4706 } 4707 accum(sfx) 4708 plg = "" 4709 lst = elg 4710 tab = col 4711 ret = lin 4712 case PFC: 4713 // preface clears previous tab and sets prefix in one command 4714 pfx = str 4715 fallthrough 4716 case CLR: 4717 // clear previous tab after the fact 4718 tab = "" 4719 case DEQ: 4720 // set queued tab after the fact 4721 tab = str 4722 case PLG: 4723 plg = str 4724 case ELG: 4725 elg = str 4726 case WRP: 4727 // shortcut to wrap elements in XML tags 4728 if str == "" || str == "-" { 4729 sep = "\t" 4730 
pfx = "" 4731 sfx = "" 4732 plg = "" 4733 elg = "" 4734 wrp = false 4735 break 4736 } 4737 // -wrp with comma-separated arguments is deprecated, but supported for backward compatibility 4738 lft, rgt := eutils.SplitInTwoRight(str, ",") 4739 if lft != "" { 4740 plg = "<" + lft + ">" 4741 elg = "</" + lft + ">" 4742 } 4743 if rgt != "" && rgt != "-" { 4744 pfx = "<" + rgt + ">" 4745 sfx = "</" + rgt + ">" 4746 sep = "</" + rgt + "><" + rgt + ">" 4747 } 4748 wrp = true 4749 case ENC: 4750 // shortcut to mark unexpanded instances with XML tags 4751 plg = "" 4752 elg = "" 4753 if str != "" && str != "-" { 4754 items := strings.Split(str, "/") 4755 for i := 0; i < len(items); i++ { 4756 plg += "<" + items[i] + ">" 4757 } 4758 for i := len(items) - 1; i >= 0; i-- { 4759 elg += "</" + items[i] + ">" 4760 } 4761 } 4762 case RST: 4763 pfx = "" 4764 sfx = "" 4765 plg = "" 4766 elg = "" 4767 sep = "\t" 4768 def = "" 4769 wrp = false 4770 case DEF: 4771 def = str 4772 case REG: 4773 reg = str 4774 case EXP: 4775 exp = str 4776 case COLOR: 4777 currColor = color.New() 4778 if str == "-" || str == "reset" || str == "clear" { 4779 plain = true 4780 break 4781 } 4782 plain = false 4783 items := strings.Split(str, ",") 4784 for _, itm := range items { 4785 switch itm { 4786 case "red": 4787 currColor.Add(color.FgRed) 4788 case "grn", "green": 4789 currColor.Add(color.FgGreen) 4790 case "blu", "blue": 4791 currColor.Add(color.FgBlue) 4792 case "blk", "black": 4793 currColor.Add(color.FgBlack) 4794 case "bld", "bold": 4795 currColor.Add(color.Bold) 4796 case "ital", "italic", "italics": 4797 currColor.Add(color.Italic) 4798 case "blink", "flash": 4799 currColor.Add(color.BlinkSlow) 4800 default: 4801 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized color argument '%s'\n", itm) 4802 os.Exit(1) 4803 } 4804 } 4805 case ACCUMULATOR: 4806 isAccum = true 4807 varname = str 4808 case VARIABLE: 4809 isAccum = false 4810 varname = str 4811 case VALUE: 4812 length := len(str) 4813 if length > 1 && 
str[0] == '(' && str[length-1] == ')' { 4814 // set variable from literal text inside parentheses, e.g., -COM "(, )" 4815 variables[varname] = str[1 : length-1] 4816 // -if "&VARIABLE" will succeed if set to blank with empty parentheses "()" 4817 } else if str == "" { 4818 // -if "&VARIABLE" will fail if initialized with empty string "" 4819 delete(variables, varname) 4820 } else { 4821 txt, ok := processClause(curr, op.Stages, mask, "", pfx, sfx, plg, sep, def, reg, exp, wrp, op.Type, index, level, variables, transform, histogram) 4822 if ok { 4823 plg = "" 4824 lst = elg 4825 if isAccum { 4826 if variables[varname] == "" { 4827 variables[varname] = txt 4828 } else { 4829 variables[varname] += sep + txt 4830 } 4831 } else { 4832 variables[varname] = txt 4833 } 4834 } 4835 } 4836 varname = "" 4837 isAccum = false 4838 default: 4839 } 4840 } 4841 4842 if plain { 4843 accum(lst) 4844 } else { 4845 printInColor(lst) 4846 } 4847 4848 return tab, ret 4849} 4850 4851// CONDITIONAL EXECUTION USES -if AND -unless STATEMENT, WITH SUPPORT FOR DEPRECATED -match AND -avoid STATEMENTS 4852 4853// conditionsAreSatisfied tests a set of conditions to determine if extraction should proceed 4854func conditionsAreSatisfied(conditions []*Operation, curr *eutils.XMLNode, mask string, index, level int, variables map[string]string) bool { 4855 4856 if curr == nil { 4857 return false 4858 } 4859 4860 required := 0 4861 observed := 0 4862 forbidden := 0 4863 isMatch := false 4864 isAvoid := false 4865 4866 // matchFound tests individual conditions 4867 matchFound := func(stages []*Step) bool { 4868 4869 if stages == nil || len(stages) < 1 { 4870 return false 4871 } 4872 4873 stage := stages[0] 4874 4875 var constraint *Step 4876 4877 if len(stages) > 1 { 4878 constraint = stages[1] 4879 } 4880 4881 status := stage.Type 4882 prnt := stage.Parent 4883 match := stage.Match 4884 attrib := stage.Attrib 4885 typL := stage.TypL 4886 strL := stage.StrL 4887 intL := stage.IntL 4888 typR := 
stage.TypR 4889 strR := stage.StrR 4890 intR := stage.IntR 4891 norm := stage.Norm 4892 wildcard := stage.Wild 4893 unescape := true 4894 4895 found := false 4896 number := "" 4897 4898 // exploreElements is a wrapper for eutils.ExploreElements, obtaining most arguments as closures 4899 exploreElements := func(proc func(string, int)) { 4900 eutils.ExploreElements(curr, mask, prnt, match, attrib, wildcard, unescape, level, proc) 4901 } 4902 4903 // test string or numeric constraints 4904 testConstraint := func(str string) bool { 4905 4906 if str == "" || constraint == nil { 4907 return false 4908 } 4909 4910 val := constraint.Value 4911 stat := constraint.Type 4912 4913 switch stat { 4914 case EQUALS, CONTAINS, ISWITHIN, STARTSWITH, ENDSWITH, ISNOT, ISBEFORE, ISAFTER, MATCHES, RESEMBLES: 4915 // substring test on element values 4916 str = strings.ToUpper(str) 4917 val = strings.ToUpper(val) 4918 4919 switch stat { 4920 case EQUALS: 4921 if str == val { 4922 return true 4923 } 4924 case CONTAINS: 4925 if strings.Contains(str, val) { 4926 return true 4927 } 4928 case ISWITHIN: 4929 if strings.Contains(val, str) { 4930 return true 4931 } 4932 case STARTSWITH: 4933 if strings.HasPrefix(str, val) { 4934 return true 4935 } 4936 case ENDSWITH: 4937 if strings.HasSuffix(str, val) { 4938 return true 4939 } 4940 case ISNOT: 4941 if str != val { 4942 return true 4943 } 4944 case ISBEFORE: 4945 if str < val { 4946 return true 4947 } 4948 case ISAFTER: 4949 if str > val { 4950 return true 4951 } 4952 case MATCHES: 4953 if eutils.RemoveCommaOrSemicolon(str) == strings.ToLower(val) { 4954 return true 4955 } 4956 case RESEMBLES: 4957 if sortStringByWords(str) == strings.ToLower(val) { 4958 return true 4959 } 4960 default: 4961 } 4962 case ISEQUALTO, DIFFERSFROM: 4963 // conditional argument is element specifier 4964 if constraint.Parent != "" || constraint.Match != "" || constraint.Attrib != "" { 4965 ch := val[0] 4966 // pound, percent, and caret prefixes supported (undocumented) 
4967 switch ch { 4968 case '#': 4969 count := 0 4970 eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) { 4971 count++ 4972 }) 4973 val = strconv.Itoa(count) 4974 case '%': 4975 length := 0 4976 eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) { 4977 if stn != "" { 4978 length += len(stn) 4979 } 4980 }) 4981 val = strconv.Itoa(length) 4982 case '^': 4983 depth := 0 4984 eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) { 4985 depth = lvl 4986 }) 4987 val = strconv.Itoa(depth) 4988 default: 4989 eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) { 4990 if stn != "" { 4991 val = stn 4992 } 4993 }) 4994 } 4995 } 4996 str = strings.ToUpper(str) 4997 val = strings.ToUpper(val) 4998 4999 switch stat { 5000 case ISEQUALTO: 5001 if str == val { 5002 return true 5003 } 5004 case DIFFERSFROM: 5005 if str != val { 5006 return true 5007 } 5008 default: 5009 } 5010 case GT, GE, LT, LE, EQ, NE: 5011 // second argument of numeric test can be element specifier 5012 if constraint.Parent != "" || constraint.Match != "" || constraint.Attrib != "" { 5013 ch := val[0] 5014 // pound, percent, and caret prefixes supported as potentially useful for data QA (undocumented) 5015 switch ch { 5016 case '#': 5017 count := 0 5018 eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) { 5019 count++ 5020 }) 5021 val = strconv.Itoa(count) 5022 case '%': 5023 length := 0 5024 eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) { 5025 
if stn != "" { 5026 length += len(stn) 5027 } 5028 }) 5029 val = strconv.Itoa(length) 5030 case '^': 5031 depth := 0 5032 eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) { 5033 depth = lvl 5034 }) 5035 val = strconv.Itoa(depth) 5036 default: 5037 eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) { 5038 if stn != "" { 5039 _, errz := strconv.Atoi(stn) 5040 if errz == nil { 5041 val = stn 5042 } 5043 } 5044 }) 5045 } 5046 } 5047 5048 // numeric tests on element values 5049 x, errx := strconv.Atoi(str) 5050 y, erry := strconv.Atoi(val) 5051 5052 // both arguments must resolve to integers 5053 if errx != nil || erry != nil { 5054 return false 5055 } 5056 5057 switch stat { 5058 case GT: 5059 if x > y { 5060 return true 5061 } 5062 case GE: 5063 if x >= y { 5064 return true 5065 } 5066 case LT: 5067 if x < y { 5068 return true 5069 } 5070 case LE: 5071 if x <= y { 5072 return true 5073 } 5074 case EQ: 5075 if x == y { 5076 return true 5077 } 5078 case NE: 5079 if x != y { 5080 return true 5081 } 5082 default: 5083 } 5084 default: 5085 } 5086 5087 return false 5088 } 5089 5090 // checkConstraint applies optional [min:max] range restriction and sends result to testConstraint 5091 checkConstraint := func(str string) bool { 5092 5093 // handle usual situation with no range first 5094 if norm { 5095 return testConstraint(str) 5096 } 5097 5098 // check for [after|before] variant 5099 if typL == STRINGRANGE || typR == STRINGRANGE { 5100 if strL != "" { 5101 // use case-insensitive test 5102 strL = strings.ToUpper(strL) 5103 idx := strings.Index(strings.ToUpper(str), strL) 5104 if idx < 0 { 5105 // specified substring must be present in original string 5106 return false 5107 } 5108 ln := len(strL) 5109 // remove leading text 5110 str = str[idx+ln:] 5111 } 5112 if strR != "" { 5113 strR 
= strings.ToUpper(strR) 5114 idx := strings.Index(strings.ToUpper(str), strR) 5115 if idx < 0 { 5116 // specified substring must be present in remaining string 5117 return false 5118 } 5119 // remove trailing text 5120 str = str[:idx] 5121 } 5122 if str != "" { 5123 return testConstraint(str) 5124 } 5125 return false 5126 } 5127 5128 min := 0 5129 max := 0 5130 5131 // slice arguments use variable value +- adjustment or integer constant 5132 if typL == VARIABLERANGE { 5133 if strL == "" { 5134 return false 5135 } 5136 lft, ok := variables[strL] 5137 if !ok { 5138 return false 5139 } 5140 val, err := strconv.Atoi(lft) 5141 if err != nil { 5142 return false 5143 } 5144 // range argument values are inclusive and 1-based, decrement variable start +- offset to use in slice 5145 min = val + intL - 1 5146 } else if typL == INTEGERRANGE { 5147 // range argument values are inclusive and 1-based, decrement literal start to use in slice 5148 min = intL - 1 5149 } 5150 if typR == VARIABLERANGE { 5151 if strR == "" { 5152 return false 5153 } 5154 rgt, ok := variables[strR] 5155 if !ok { 5156 return false 5157 } 5158 val, err := strconv.Atoi(rgt) 5159 if err != nil { 5160 return false 5161 } 5162 if val+intR < 0 { 5163 // negative value is 1-based inset from end of string (undocumented) 5164 max = len(str) + val + intR + 1 5165 } else { 5166 max = val + intR 5167 } 5168 } else if typR == INTEGERRANGE { 5169 if intR < 0 { 5170 // negative max is inset from end of string (undocumented) 5171 max = len(str) + intR + 1 5172 } else { 5173 max = intR 5174 } 5175 } 5176 5177 // numeric range now calculated, apply slice to string 5178 if min == 0 && max == 0 { 5179 return testConstraint(str) 5180 } else if max == 0 { 5181 if min > 0 && min < len(str) { 5182 str = str[min:] 5183 if str != "" { 5184 return testConstraint(str) 5185 } 5186 } 5187 } else if min == 0 { 5188 if max > 0 && max <= len(str) { 5189 str = str[:max] 5190 if str != "" { 5191 return testConstraint(str) 5192 } 5193 } 
5194 } else { 5195 if min < max && min > 0 && max <= len(str) { 5196 str = str[min:max] 5197 if str != "" { 5198 return testConstraint(str) 5199 } 5200 } 5201 } 5202 5203 return false 5204 } 5205 5206 switch status { 5207 case ELEMENT: 5208 exploreElements(func(str string, lvl int) { 5209 // match to XML container object sends empty string, so do not check for str != "" here 5210 // test every selected element individually if value is specified 5211 if constraint == nil || checkConstraint(str) { 5212 found = true 5213 } 5214 }) 5215 case VARIABLE: 5216 // use value of stored variable 5217 str, ok := variables[match] 5218 if ok { 5219 // -if &VARIABLE -equals VALUE is the supported construct 5220 if constraint == nil || checkConstraint(str) { 5221 found = true 5222 } 5223 } 5224 case COUNT: 5225 count := 0 5226 5227 exploreElements(func(str string, lvl int) { 5228 count++ 5229 found = true 5230 }) 5231 5232 // number of element objects 5233 number = strconv.Itoa(count) 5234 case LENGTH: 5235 length := 0 5236 5237 exploreElements(func(str string, lvl int) { 5238 length += len(str) 5239 found = true 5240 }) 5241 5242 // length of element strings 5243 number = strconv.Itoa(length) 5244 case DEPTH: 5245 depth := 0 5246 5247 exploreElements(func(str string, lvl int) { 5248 depth = lvl 5249 found = true 5250 }) 5251 5252 // depth of last element in scope 5253 number = strconv.Itoa(depth) 5254 case INDEX: 5255 // index of explored parent object 5256 number = strconv.Itoa(index) 5257 found = true 5258 default: 5259 } 5260 5261 if number == "" { 5262 return found 5263 } 5264 5265 if constraint == nil || checkConstraint(number) { 5266 return true 5267 } 5268 5269 return false 5270 } 5271 5272 // test conditional arguments 5273 for _, op := range conditions { 5274 5275 switch op.Type { 5276 // -if tests for presence of element (deprecated -match can test element:value) 5277 case SELECT, IF, MATCH: 5278 // checking for failure here allows for multiple -if [ -and / -or ] clauses 
5279 if isMatch && observed < required { 5280 return false 5281 } 5282 if isAvoid && forbidden > 0 { 5283 return false 5284 } 5285 required = 0 5286 observed = 0 5287 forbidden = 0 5288 isMatch = true 5289 isAvoid = false 5290 // continue on to next two cases 5291 fallthrough 5292 case AND: 5293 required++ 5294 // continue on to next case 5295 fallthrough 5296 case OR: 5297 if matchFound(op.Stages) { 5298 observed++ 5299 // record presence of forbidden element if in -unless clause 5300 forbidden++ 5301 } 5302 // -unless tests for absence of element, or presence but with failure of subsequent value test (deprecated -avoid can test element:value) 5303 case UNLESS, AVOID: 5304 if isMatch && observed < required { 5305 return false 5306 } 5307 if isAvoid && forbidden > 0 { 5308 return false 5309 } 5310 required = 0 5311 observed = 0 5312 forbidden = 0 5313 isMatch = false 5314 isAvoid = true 5315 if matchFound(op.Stages) { 5316 forbidden++ 5317 } 5318 default: 5319 } 5320 } 5321 5322 if isMatch && observed < required { 5323 return false 5324 } 5325 if isAvoid && forbidden > 0 { 5326 return false 5327 } 5328 5329 return true 5330} 5331 5332// RECURSIVELY PROCESS EXPLORATION COMMANDS AND XML DATA STRUCTURE 5333 5334// processCommands visits XML nodes, performs conditional tests, and executes data extraction instructions 5335func processCommands(cmds *Block, curr *eutils.XMLNode, tab, ret string, index, level int, variables map[string]string, transform map[string]string, histogram map[string]int, accum func(string)) (string, string) { 5336 5337 if accum == nil { 5338 return tab, ret 5339 } 5340 5341 prnt := cmds.Parent 5342 match := cmds.Match 5343 5344 // leading colon indicates namespace prefix wildcard 5345 wildcard := false 5346 if strings.HasPrefix(prnt, ":") || strings.HasPrefix(match, ":") { 5347 wildcard = true 5348 } 5349 5350 // **/Object performs deep exploration of recursive data 5351 deep := false 5352 if prnt == "**" { 5353 prnt = "*" 5354 deep = true 5355 } 
5356 // Object/** performs exhaustive exploration of nodes 5357 tall := false 5358 if match == "**" { 5359 match = "*" 5360 tall = true 5361 } 5362 5363 // closure passes local variables to callback, which can modify caller tab and ret values 5364 processNode := func(node *eutils.XMLNode, idx, lvl int) { 5365 5366 // apply -if or -unless tests 5367 if conditionsAreSatisfied(cmds.Conditions, node, match, idx, lvl, variables) { 5368 5369 // execute data extraction commands 5370 if len(cmds.Commands) > 0 { 5371 tab, ret = processInstructions(cmds.Commands, node, match, tab, ret, idx, lvl, variables, transform, histogram, accum) 5372 } 5373 5374 // process sub commands on child node 5375 for _, sub := range cmds.Subtasks { 5376 tab, ret = processCommands(sub, node, tab, ret, 1, lvl, variables, transform, histogram, accum) 5377 } 5378 5379 } else { 5380 5381 // execute commands after -else statement 5382 if len(cmds.Failure) > 0 { 5383 tab, ret = processInstructions(cmds.Failure, node, match, tab, ret, idx, lvl, variables, transform, histogram, accum) 5384 } 5385 } 5386 } 5387 5388 // exploreNodes recursive definition 5389 var exploreNodes func(*eutils.XMLNode, int, int, bool, func(*eutils.XMLNode, int, int)) int 5390 5391 // exploreNodes visits all nodes that match the selection criteria 5392 exploreNodes = func(curr *eutils.XMLNode, indx, levl int, force bool, proc func(*eutils.XMLNode, int, int)) int { 5393 5394 if curr == nil || proc == nil { 5395 return indx 5396 } 5397 5398 // match is "*" for heterogeneous data constructs, e.g., -group PubmedArticleSet/* 5399 // wildcard matches any namespace prefix 5400 if curr.Name == match || 5401 match == "*" || 5402 (wildcard && strings.HasPrefix(match, ":") && strings.HasSuffix(curr.Name, match)) { 5403 5404 if prnt == "" || 5405 curr.Parent == prnt || 5406 force || 5407 (wildcard && strings.HasPrefix(prnt, ":") && strings.HasSuffix(curr.Parent, prnt)) { 5408 5409 proc(curr, indx, levl) 5410 indx++ 5411 5412 if tall && prnt 
!= "" { 5413 // exhaustive exploration of child nodes within region of parent match 5414 for chld := curr.Children; chld != nil; chld = chld.Next { 5415 indx = exploreNodes(chld, indx, levl+1, true, proc) 5416 } 5417 } 5418 5419 if !deep { 5420 // do not explore within recursive object 5421 return indx 5422 } 5423 } 5424 } 5425 5426 // clearing prnt "*" now allows nested exploration within recursive data, e.g., -pattern Taxon -block */Taxon 5427 if prnt == "*" { 5428 prnt = "" 5429 } 5430 5431 // explore child nodes 5432 for chld := curr.Children; chld != nil; chld = chld.Next { 5433 indx = exploreNodes(chld, indx, levl+1, false, proc) 5434 } 5435 5436 return indx 5437 } 5438 5439 // explorePath recursive definition 5440 var explorePath func(*eutils.XMLNode, []string, int, int, func(*eutils.XMLNode, int, int)) int 5441 5442 // explorePath visits child nodes and matches against next entry in path 5443 explorePath = func(curr *eutils.XMLNode, path []string, indx, levl int, proc func(*eutils.XMLNode, int, int)) int { 5444 5445 if curr == nil || proc == nil { 5446 return indx 5447 } 5448 5449 if len(path) < 1 { 5450 proc(curr, indx, levl) 5451 indx++ 5452 return indx 5453 } 5454 5455 name := path[0] 5456 rest := path[1:] 5457 5458 // explore next level of child nodes 5459 for chld := curr.Children; chld != nil; chld = chld.Next { 5460 if chld.Name == name { 5461 // recurse only if child matches next component in path 5462 indx = explorePath(chld, rest, indx, levl+1, proc) 5463 } 5464 } 5465 5466 return indx 5467 } 5468 5469 if cmds.Foreword != "" { 5470 accum(cmds.Foreword) 5471 } 5472 5473 // apply -position test 5474 5475 if cmds.Position == "" || cmds.Position == "all" { 5476 5477 exploreNodes(curr, index, level, false, processNode) 5478 5479 } else if cmds.Position == "path" { 5480 5481 exploreNodes(curr, index, level, false, 5482 func(node *eutils.XMLNode, idx, lvl int) { 5483 // exploreNodes callback has matched first path component, now explore remainder one 
level and component at a time 5484 explorePath(node, cmds.Path, idx, lvl, processNode) 5485 }) 5486 5487 } else { 5488 5489 var single *eutils.XMLNode 5490 lev := 0 5491 ind := 0 5492 5493 if cmds.Position == "first" { 5494 5495 exploreNodes(curr, index, level, false, 5496 func(node *eutils.XMLNode, idx, lvl int) { 5497 if single == nil { 5498 single = node 5499 ind = idx 5500 lev = lvl 5501 } 5502 }) 5503 5504 } else if cmds.Position == "last" { 5505 5506 exploreNodes(curr, index, level, false, 5507 func(node *eutils.XMLNode, idx, lvl int) { 5508 single = node 5509 ind = idx 5510 lev = lvl 5511 }) 5512 5513 } else if cmds.Position == "outer" { 5514 5515 // print only first and last nodes 5516 var beg *Limiter 5517 var end *Limiter 5518 5519 exploreNodes(curr, index, level, false, 5520 func(node *eutils.XMLNode, idx, lvl int) { 5521 if beg == nil { 5522 beg = &Limiter{node, idx, lvl} 5523 } else { 5524 end = &Limiter{node, idx, lvl} 5525 } 5526 }) 5527 5528 if beg != nil { 5529 processNode(beg.Obj, beg.Idx, beg.Lvl) 5530 } 5531 if end != nil { 5532 processNode(end.Obj, end.Idx, end.Lvl) 5533 } 5534 5535 } else if cmds.Position == "inner" { 5536 5537 // print all but first and last nodes 5538 var prev *Limiter 5539 var next *Limiter 5540 first := true 5541 5542 exploreNodes(curr, index, level, false, 5543 func(node *eutils.XMLNode, idx, lvl int) { 5544 if first { 5545 first = false 5546 return 5547 } 5548 5549 prev = next 5550 next = &Limiter{node, idx, lvl} 5551 5552 if prev != nil { 5553 processNode(prev.Obj, prev.Idx, prev.Lvl) 5554 } 5555 }) 5556 5557 } else if cmds.Position == "even" { 5558 5559 okay := false 5560 5561 exploreNodes(curr, index, level, false, 5562 func(node *eutils.XMLNode, idx, lvl int) { 5563 if okay { 5564 processNode(node, idx, lvl) 5565 } 5566 okay = !okay 5567 }) 5568 5569 } else if cmds.Position == "odd" { 5570 5571 okay := true 5572 5573 exploreNodes(curr, index, level, false, 5574 func(node *eutils.XMLNode, idx, lvl int) { 5575 if okay 
{ 5576 processNode(node, idx, lvl) 5577 } 5578 okay = !okay 5579 }) 5580 5581 } else { 5582 5583 // use numeric position 5584 number, err := strconv.Atoi(cmds.Position) 5585 if err == nil { 5586 5587 pos := 0 5588 5589 exploreNodes(curr, index, level, false, 5590 func(node *eutils.XMLNode, idx, lvl int) { 5591 pos++ 5592 if pos == number { 5593 single = node 5594 ind = idx 5595 lev = lvl 5596 } 5597 }) 5598 5599 } else { 5600 5601 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized position '%s'\n", cmds.Position) 5602 os.Exit(1) 5603 } 5604 } 5605 5606 if single != nil { 5607 processNode(single, ind, lev) 5608 } 5609 } 5610 5611 if cmds.Afterword != "" { 5612 accum(cmds.Afterword) 5613 } 5614 5615 return tab, ret 5616} 5617 5618// PROCESS ONE XML COMPONENT RECORD 5619 5620// processQuery perform data extraction driven by command-line arguments 5621func processQuery(text, parent string, index int, hd, tl string, transform map[string]string, histogram map[string]int, cmds *Block) string { 5622 5623 if text == "" || cmds == nil { 5624 return "" 5625 } 5626 5627 // exit from function will collect garbage of node structure for current XML object 5628 pat := eutils.ParseRecord(text, parent) 5629 5630 if pat == nil { 5631 return "" 5632 } 5633 5634 // exit from function will also free map of recorded variables for current -pattern 5635 variables := make(map[string]string) 5636 5637 var buffer strings.Builder 5638 5639 ok := false 5640 5641 if hd != "" { 5642 buffer.WriteString(hd) 5643 } 5644 5645 ret := "" 5646 5647 if cmds.Position == "select" { 5648 5649 if conditionsAreSatisfied(cmds.Conditions, pat, cmds.Match, index, 1, variables) { 5650 ok = true 5651 buffer.WriteString(text) 5652 ret = "\n" 5653 } 5654 5655 } else { 5656 5657 // start processing at top of command tree and top of XML subregion selected by -pattern 5658 _, ret = processCommands(cmds, pat, "", "", index, 1, variables, transform, histogram, 5659 func(str string) { 5660 if str != "" { 5661 ok = true 5662 
// processINSD generates extraction commands for GenBank/RefSeq records in INSDSet format.
//
// e.g., xtract -insd complete mat_peptide "%peptide" product peptide
//
// args holds the arguments that follow -insd: optional leading INSD* element
// names (descriptors), an optional "complete"/"+" or "partial"/"-" selector,
// a feature key (several may be joined with '+' or ','), and then qualifier
// names, INSD* element names, or the special sub_sequence / feat_location
// items. When isPipe is true the generated arguments are emitted raw;
// otherwise string values are wrapped in double quotes. addDash adds a
// placeholder label for missing values, and doIndex switches the output to
// IdxDocument / -indices form.
//
// Illegal or mis-capitalized names, and missing arguments, are reported on
// stderr and terminate the program with exit status 1.
func processINSD(args []string, isPipe, addDash, doIndex bool) []string {

	// legal GenBank / GenPept / RefSeq features

	features := []string{
		"-10_signal",
		"-35_signal",
		"3'clip",
		"3'UTR",
		"5'clip",
		"5'UTR",
		"allele",
		"assembly_gap",
		"attenuator",
		"Bond",
		"C_region",
		"CAAT_signal",
		"CDS",
		"centromere",
		"conflict",
		"D_segment",
		"D-loop",
		"enhancer",
		"exon",
		"gap",
		"GC_signal",
		"gene",
		"iDNA",
		"intron",
		"J_segment",
		"LTR",
		"mat_peptide",
		"misc_binding",
		"misc_difference",
		"misc_feature",
		"misc_recomb",
		"misc_RNA",
		"misc_signal",
		"misc_structure",
		"mobile_element",
		"modified_base",
		"mRNA",
		"mutation",
		"N_region",
		"ncRNA",
		"old_sequence",
		"operon",
		"oriT",
		"polyA_signal",
		"polyA_site",
		"precursor_RNA",
		"prim_transcript",
		"primer_bind",
		"promoter",
		"propeptide",
		"protein_bind",
		"Protein",
		"RBS",
		"Region",
		"regulatory",
		"rep_origin",
		"repeat_region",
		"repeat_unit",
		"rRNA",
		"S_region",
		"satellite",
		"scRNA",
		"sig_peptide",
		"Site",
		"snoRNA",
		"snRNA",
		"source",
		"stem_loop",
		"STS",
		"TATA_signal",
		"telomere",
		"terminator",
		"tmRNA",
		"transit_peptide",
		"tRNA",
		"unsure",
		"V_region",
		"V_segment",
		"variation",
	}

	// legal GenBank / GenPept / RefSeq qualifiers

	qualifiers := []string{
		"allele",
		"altitude",
		"anticodon",
		"artificial_location",
		"bio_material",
		"bond_type",
		"bound_moiety",
		"breed",
		"calculated_mol_wt",
		"cell_line",
		"cell_type",
		"chloroplast",
		"chromoplast",
		"chromosome",
		"circular_RNA",
		"citation",
		"clone_lib",
		"clone",
		"coded_by",
		"codon_start",
		"codon",
		"collected_by",
		"collection_date",
		"compare",
		"cons_splice",
		"country",
		"cultivar",
		"culture_collection",
		"cyanelle",
		"db_xref",
		"derived_from",
		"dev_stage",
		"direction",
		"EC_number",
		"ecotype",
		"encodes",
		"endogenous_virus",
		"environmental_sample",
		"estimated_length",
		"evidence",
		"exception",
		"experiment",
		"focus",
		"frequency",
		"function",
		"gap_type",
		"gdb_xref",
		"gene_synonym",
		"gene",
		"germline",
		"haplogroup",
		"haplotype",
		"host",
		"identified_by",
		"inference",
		"insertion_seq",
		"isolate",
		"isolation_source",
		"kinetoplast",
		"lab_host",
		"label",
		"lat_lon",
		"linkage_evidence",
		"locus_tag",
		"macronuclear",
		"map",
		"mating_type",
		"metagenome_source",
		"metagenomic",
		"mitochondrion",
		"mobile_element_type",
		"mobile_element",
		"mod_base",
		"mol_type",
		"name",
		"nat_host",
		"ncRNA_class",
		"non_functional",
		"note",
		"number",
		"old_locus_tag",
		"operon",
		"organelle",
		"organism",
		"partial",
		"PCR_conditions",
		"PCR_primers",
		"peptide",
		"phenotype",
		"plasmid",
		"pop_variant",
		"product",
		"protein_id",
		"proviral",
		"pseudo",
		"pseudogene",
		"rearranged",
		"recombination_class",
		"region_name",
		"regulatory_class",
		"replace",
		"ribosomal_slippage",
		"rpt_family",
		"rpt_type",
		"rpt_unit_range",
		"rpt_unit_seq",
		"rpt_unit",
		"satellite",
		"segment",
		"sequenced_mol",
		"serotype",
		"serovar",
		"sex",
		"site_type",
		"specific_host",
		"specimen_voucher",
		"standard_name",
		"strain",
		"structural_class",
		"sub_clone",
		"sub_species",
		"sub_strain",
		"submitter_seqid",
		"tag_peptide",
		"tissue_lib",
		"tissue_type",
		"trans_splicing",
		"transcript_id",
		"transcription",
		"transgenic",
		"transl_except",
		"transl_table",
		"translation",
		"transposon",
		"type_material",
		"UniProtKB_evidence",
		"usedin",
		"variety",
		"virion",
	}

	// legal INSDSeq XML fields

	insdtags := []string{
		"INSDAltSeqData_items",
		"INSDAltSeqData",
		"INSDAltSeqItem_first-accn",
		"INSDAltSeqItem_gap-comment",
		"INSDAltSeqItem_gap-length",
		"INSDAltSeqItem_gap-linkage",
		"INSDAltSeqItem_gap-type",
		"INSDAltSeqItem_interval",
		"INSDAltSeqItem_isgap",
		"INSDAltSeqItem_isgap@value",
		"INSDAltSeqItem_last-accn",
		"INSDAltSeqItem_value",
		"INSDAltSeqItem",
		"INSDAuthor",
		"INSDComment_paragraphs",
		"INSDComment_type",
		"INSDComment",
		"INSDCommentParagraph",
		"INSDFeature_intervals",
		"INSDFeature_key",
		"INSDFeature_location",
		"INSDFeature_operator",
		"INSDFeature_partial3",
		"INSDFeature_partial3@value",
		"INSDFeature_partial5",
		"INSDFeature_partial5@value",
		"INSDFeature_quals",
		"INSDFeature_xrefs",
		"INSDFeature",
		"INSDFeatureSet_annot-source",
		"INSDFeatureSet_features",
		"INSDFeatureSet",
		"INSDInterval_accession",
		"INSDInterval_from",
		"INSDInterval_interbp",
		"INSDInterval_interbp@value",
		"INSDInterval_iscomp",
		"INSDInterval_iscomp@value",
		"INSDInterval_point",
		"INSDInterval_to",
		"INSDInterval",
		"INSDKeyword",
		"INSDQualifier_name",
		"INSDQualifier_value",
		"INSDQualifier",
		"INSDReference_authors",
		"INSDReference_consortium",
		"INSDReference_journal",
		"INSDReference_position",
		"INSDReference_pubmed",
		"INSDReference_reference",
		"INSDReference_remark",
		"INSDReference_title",
		"INSDReference_xref",
		"INSDReference",
		"INSDSecondary-accn",
		"INSDSeq_accession-version",
		"INSDSeq_alt-seq",
		"INSDSeq_comment-set",
		"INSDSeq_comment",
		"INSDSeq_contig",
		"INSDSeq_create-date",
		"INSDSeq_create-release",
		"INSDSeq_database-reference",
		"INSDSeq_definition",
		"INSDSeq_division",
		"INSDSeq_entry-version",
		"INSDSeq_feature-set",
		"INSDSeq_feature-table",
		"INSDSeq_keywords",
		"INSDSeq_length",
		"INSDSeq_locus",
		"INSDSeq_moltype",
		"INSDSeq_organism",
		"INSDSeq_other-seqids",
		"INSDSeq_primary-accession",
		"INSDSeq_primary",
		"INSDSeq_project",
		"INSDSeq_references",
		"INSDSeq_secondary-accessions",
		"INSDSeq_segment",
		"INSDSeq_sequence",
		"INSDSeq_source-db",
		"INSDSeq_source",
		"INSDSeq_strandedness",
		"INSDSeq_struc-comments",
		"INSDSeq_taxonomy",
		"INSDSeq_topology",
		"INSDSeq_update-date",
		"INSDSeq_update-release",
		"INSDSeq_xrefs",
		"INSDSeq",
		"INSDSeqid",
		"INSDSet",
		"INSDStrucComment_items",
		"INSDStrucComment_name",
		"INSDStrucComment",
		"INSDStrucCommentItem_tag",
		"INSDStrucCommentItem_url",
		"INSDStrucCommentItem_value",
		"INSDStrucCommentItem",
		"INSDXref_dbname",
		"INSDXref_id",
		"INSDXref",
	}

	// checkAgainstVocabulary returns silently when str (minus one leading
	// '#', '%' or '^') is in arry, and otherwise prints an error and exits
	checkAgainstVocabulary := func(str, objtype string, arry []string) {

		if str == "" || arry == nil {
			return
		}

		// skip past pound, percent, or caret character at beginning of string
		if len(str) > 1 {
			switch str[0] {
			case '#', '%', '^':
				str = str[1:]
			}
		}

		for _, txt := range arry {
			if str == txt {
				return
			}
			// same letters but different case gets a more helpful message
			if strings.EqualFold(str, txt) {
				fmt.Fprintf(os.Stderr, "\nERROR: Incorrect capitalization of '%s' %s, change to '%s'\n", str, objtype, txt)
				os.Exit(1)
			}
		}

		fmt.Fprintf(os.Stderr, "\nERROR: Item '%s' is not a legal -insd %s\n", str, objtype)
		os.Exit(1)
	}

	// qualifiers that are reported by name (-lbl / -indices on the name)
	// rather than by INSDQualifier_value
	isFlagQualifier := func(name string) bool {
		switch name {
		case "chloroplast",
			"chromoplast",
			"cyanelle",
			"environmental_sample",
			"focus",
			"germline",
			"kinetoplast",
			"macronuclear",
			"metagenomic",
			"mitochondrion",
			"partial",
			"proviral",
			"pseudo",
			"rearranged",
			"ribosomal_slippage",
			"trans_splicing",
			"transgenic",
			"virion":
			return true
		}
		return false
	}

	// placeholder label emitted by addDash when a value is absent
	dash := "\"\\-\""
	if isPipe {
		dash = "\\-"
	}

	var acc []string

	// count tracks the original argument total (less the complete / partial
	// selector); it is NOT reduced when leading INSD descriptors are consumed
	count := len(args)
	if count < 1 {
		fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to xtract -insd\n")
		os.Exit(1)
	}

	if doIndex {
		if isPipe {
			acc = append(acc, "-head", "<IdxDocumentSet>", "-tail", "</IdxDocumentSet>")
			acc = append(acc, "-hd", " <IdxDocument>\n", "-tl", " </IdxDocument>")
			acc = append(acc, "-pattern", "INSDSeq", "-pfx", " <IdxUid>", "-sfx", "</IdxUid>\n")
			acc = append(acc, "-element", "INSDSeq_accession-version", "-clr", "-rst", "-tab", "\n")
		} else {
			acc = append(acc, "-head", "\"<IdxDocumentSet>\"", "-tail", "\"</IdxDocumentSet>\"")
			acc = append(acc, "-hd", "\" <IdxDocument>\\n\"", "-tl", "\" </IdxDocument>\"")
			acc = append(acc, "-pattern", "INSDSeq", "-pfx", "\" <IdxUid>\"", "-sfx", "\"</IdxUid>\\n\"")
			acc = append(acc, "-element", "INSDSeq_accession-version", "-clr", "-rst", "-tab", "\\n")
		}
	} else {
		acc = append(acc, "-pattern", "INSDSeq", "-ACCN", "INSDSeq_accession-version", "-SEQ", "INSDSeq_sequence")
	}

	if doIndex {
		if isPipe {
			acc = append(acc, "-group", "INSDSeq", "-lbl", " <IdxSearchFields>\n")
		} else {
			acc = append(acc, "-group", "INSDSeq", "-lbl", "\" <IdxSearchFields>\\n\"")
		}
	}

	printAccn := true

	// collect descriptors

	if strings.HasPrefix(args[0], "INSD") {

		if doIndex {
			acc = append(acc, "-clr", "-indices")
		} else {
			if isPipe {
				acc = append(acc, "-clr", "-pfx", "\\n", "-element", "&ACCN")
				acc = append(acc, "-group", "INSDSeq", "-sep", "|", "-element")
			} else {
				acc = append(acc, "-clr", "-pfx", "\"\\n\"", "-element", "\"&ACCN\"")
				acc = append(acc, "-group", "INSDSeq", "-sep", "\"|\"", "-element")
			}
			// accession is already printed with the descriptors
			printAccn = false
		}

		for {
			if len(args) < 1 {
				// only descriptors were given, no feature section requested
				return acc
			}
			str := args[0]
			if !strings.HasPrefix(str, "INSD") {
				break
			}
			checkAgainstVocabulary(str, "element", insdtags)
			acc = append(acc, str)
			args = args[1:]
		}

	} else if strings.HasPrefix(strings.ToUpper(args[0]), "INSD") {

		// report capitalization or vocabulary failure
		checkAgainstVocabulary(args[0], "element", insdtags)

		// program should not get to this point, but warn and exit anyway
		fmt.Fprintf(os.Stderr, "\nERROR: Item '%s' is not a legal -insd %s\n", args[0], "element")
		os.Exit(1)
	}

	// collect qualifiers

	partial := false
	complete := false

	if args[0] == "+" || args[0] == "complete" {
		complete = true
		args = args[1:]
		count--
	} else if args[0] == "-" || args[0] == "partial" {
		partial = true
		args = args[1:]
		count--
	}

	// Test the live slice rather than count: count still includes any
	// descriptors consumed above, so it cannot detect that nothing is left
	// (e.g. "INSDSeq_length complete"), and args[0] below would then panic.
	if len(args) < 1 {
		fmt.Fprintf(os.Stderr, "\nERROR: No feature key supplied to xtract -insd\n")
		os.Exit(1)
	}

	acc = append(acc, "-group", "INSDFeature")

	// limit to designated features

	feature := args[0]

	fcmd := "-if"

	// can specify multiple features separated by plus sign (e.g., CDS+mRNA) or comma (e.g., CDS,mRNA)
	for _, pls := range strings.Split(feature, "+") {
		for _, cma := range strings.Split(pls, ",") {

			checkAgainstVocabulary(cma, "feature", features)
			acc = append(acc, fcmd, "INSDFeature_key", "-equals", cma)

			// every feature after the first is an alternative
			fcmd = "-or"
		}
	}

	// NOTE(review): count is deliberately left as the original total here, so
	// command lines that begin with INSD descriptors are still accepted
	// without a trailing qualifier, exactly as before.
	if count < 2 {
		// still need at least one qualifier even on legal feature
		fmt.Fprintf(os.Stderr, "\nERROR: Feature '%s' must be followed by at least one qualifier\n", feature)
		os.Exit(1)
	}

	args = args[1:]

	if complete {
		acc = append(acc, "-unless", "INSDFeature_partial5", "-or", "INSDFeature_partial3")
	} else if partial {
		acc = append(acc, "-if", "INSDFeature_partial5", "-or", "INSDFeature_partial3")
	}

	if printAccn && !doIndex {
		if isPipe {
			acc = append(acc, "-clr", "-pfx", "\\n", "-element", "&ACCN")
		} else {
			acc = append(acc, "-clr", "-pfx", "\"\\n\"", "-element", "\"&ACCN\"")
		}
	}

	for _, str := range args {

		// mol_wt is accepted as shorthand
		if str == "mol_wt" {
			str = "calculated_mol_wt"
		}

		switch {
		case strings.HasPrefix(str, "INSD"):

			checkAgainstVocabulary(str, "element", insdtags)
			if doIndex {
				acc = append(acc, "-block", "INSDFeature", "-clr", "-indices")
			} else if isPipe {
				acc = append(acc, "-block", "INSDFeature", "-sep", "|", "-element")
			} else {
				acc = append(acc, "-block", "INSDFeature", "-sep", "\"|\"", "-element")
			}
			acc = append(acc, str)
			if addDash {
				acc = append(acc, "-block", "INSDFeature", "-unless", str)
				if strings.HasSuffix(str, "@value") {
					// missing boolean attribute is shown as false
					if isPipe {
						acc = append(acc, "-lbl", "false")
					} else {
						acc = append(acc, "-lbl", "\"false\"")
					}
				} else {
					acc = append(acc, "-lbl", dash)
				}
			}

		case strings.HasPrefix(str, "#INSD"):

			checkAgainstVocabulary(str, "element", insdtags)
			if doIndex {
				// NOTE(review): unlike the plain INSD case, no operand follows
				// -indices here; looks like an omission, behavior left unchanged
				acc = append(acc, "-block", "INSDFeature", "-clr", "-indices")
			} else if isPipe {
				acc = append(acc, "-block", "INSDFeature", "-sep", "|", "-element")
				acc = append(acc, str)
			} else {
				acc = append(acc, "-block", "INSDFeature", "-sep", "\"|\"", "-element")
				acc = append(acc, "\""+str+"\"")
			}

		case strings.HasPrefix(strings.ToUpper(str), "#INSD"):

			// report capitalization or vocabulary failure
			checkAgainstVocabulary(str, "element", insdtags)

		case str == "sub_sequence":

			// special sub_sequence qualifier shows sequence under feature intervals
			acc = append(acc, "-block", "INSDFeature_intervals")

			acc = append(acc, "-subset", "INSDInterval", "-FR", "INSDInterval_from", "-TO", "INSDInterval_to")
			if isPipe {
				acc = append(acc, "-pfx", "", "-tab", "", "-nucleic", "&SEQ[&FR:&TO]")
			} else {
				acc = append(acc, "-pfx", "\"\"", "-tab", "\"\"", "-nucleic", "\"&SEQ[&FR:&TO]\"")
			}

			acc = append(acc, "-subset", "INSDFeature_intervals")
			if isPipe {
				acc = append(acc, "-deq", "\\t")
			} else {
				acc = append(acc, "-deq", "\"\\t\"")
			}

		case str == "feat_location":

			// special feat_location qualifier shows feature intervals
			acc = append(acc, "-block", "INSDFeature_intervals")

			acc = append(acc, "-subset", "INSDInterval", "-FR", "INSDInterval_from", "-TO", "INSDInterval_to")
			if isPipe {
				acc = append(acc, "-pfx", "", "-tab", "..", "-element", "&FR")
				acc = append(acc, "-pfx", "", "-tab", ",", "-element", "&TO")
			} else {
				acc = append(acc, "-pfx", "\"\"", "-tab", "\"..\"", "-element", "\"&FR\"")
				acc = append(acc, "-pfx", "\"\"", "-tab", "\",\"", "-element", "\"&TO\"")
			}

			acc = append(acc, "-subset", "INSDFeature_intervals")
			if isPipe {
				acc = append(acc, "-deq", "\\t")
			} else {
				acc = append(acc, "-deq", "\"\\t\"")
			}

		case isFlagQualifier(str):

			acc = append(acc, "-block", "INSDQualifier")

			checkAgainstVocabulary(str, "qualifier", qualifiers)
			acc = append(acc, "-if", "INSDQualifier_name", "-equals", str)
			if doIndex {
				acc = append(acc, "-clr", "-indices", "INSDQualifier_name")
			} else {
				acc = append(acc, "-lbl", str)
			}
			if addDash {
				acc = append(acc, "-block", "INSDFeature", "-unless", "INSDQualifier_name", "-equals", str)
				acc = append(acc, "-lbl", dash)
			}

		default:

			acc = append(acc, "-block", "INSDQualifier")

			checkAgainstVocabulary(str, "qualifier", qualifiers)
			if len(str) > 2 && str[0] == '%' {
				// percent prefix is moved from the qualifier name onto the value element
				name := str[1:]
				acc = append(acc, "-if", "INSDQualifier_name", "-equals", name)
				if doIndex {
					if isPipe {
						acc = append(acc, "-clr", "-indices", "%INSDQualifier_value")
					} else {
						acc = append(acc, "-clr", "-indices", "\"%INSDQualifier_value\"")
					}
				} else {
					if isPipe {
						acc = append(acc, "-element", "%INSDQualifier_value")
					} else {
						acc = append(acc, "-element", "\"%INSDQualifier_value\"")
					}
				}
				if addDash {
					acc = append(acc, "-block", "INSDFeature", "-unless", "INSDQualifier_name", "-equals", name)
					acc = append(acc, "-lbl", dash)
				}
			} else {
				acc = append(acc, "-if", "INSDQualifier_name", "-equals", str)
				if doIndex {
					acc = append(acc, "-clr", "-indices", "INSDQualifier_value")
				} else {
					acc = append(acc, "-element", "INSDQualifier_value")
				}
				if addDash {
					acc = append(acc, "-block", "INSDFeature", "-unless", "INSDQualifier_name", "-equals", str)
					acc = append(acc, "-lbl", dash)
				}
			}
		}
	}

	if doIndex {
		if isPipe {
			acc = append(acc, "-group", "INSDSeq", "-clr", "-lbl", " </IdxSearchFields>\n")
		} else {
			acc = append(acc, "-group", "INSDSeq", "-clr", "-lbl", "\" </IdxSearchFields>\\n\"")
		}
	}

	return acc
}
// BIOTHINGS EXTRACTION COMMAND GENERATOR

// processBiopath generates extraction commands for BioThings resources (undocumented).
//
// args[0] names the -pattern object and args[1] is a comma-separated list of
// dot-delimited paths. Each path yields a -path / -tab / -element triple whose
// element is the last component of the path. Values are emitted raw when
// isPipe is true and wrapped in double quotes otherwise.
func processBiopath(args []string, isPipe bool) []string {

	// nquire -get "http://myvariant.info/v1/variant/chr6:g.26093141G>A" \
	//   -fields clinvar.rcv.conditions.identifiers \
	//   -always_list clinvar.rcv.conditions.identifiers |
	// transmute -j2x |
	// xtract -biopath opt clinvar.rcv.conditions.identifiers.omim

	if len(args) < 2 {
		fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to xtract -biopath\n")
		os.Exit(1)
	}

	record, spec := args[0], args[1]

	// wrap leaves a value untouched in pipe mode and double-quotes it otherwise
	wrap := func(val string) string {
		if isPipe {
			return val
		}
		return "\"" + val + "\""
	}

	cmds := []string{"-pattern", record}

	for _, dotted := range strings.Split(spec, ",") {

		nodes := strings.Split(dotted, ".")
		depth := len(nodes)

		// strings.Split with a non-empty separator always returns at least one
		// element, so this guard is kept only as a defensive check
		if depth < 1 {
			fmt.Fprintf(os.Stderr, "\nERROR: Insufficient path arguments supplied to xtract -biopath\n")
			os.Exit(1)
		}
		if depth > 7 {
			fmt.Fprintf(os.Stderr, "\nERROR: Too many nodes in argument supplied to xtract -biopath\n")
			os.Exit(1)
		}

		// the final path component is the element to print
		leaf := nodes[depth-1]

		cmds = append(cmds,
			"-path", wrap(dotted),
			"-tab", wrap("\\n"),
			"-element", wrap(leaf))
	}

	return cmds
}

// HYDRA CITATION MATCHER COMMAND GENERATOR

// processHydra generates extraction commands for NCBI's in-house citation matcher (undocumented).
// The isPipe argument is accepted but does not affect the result.
func processHydra(isPipe bool) []string {

	// acceptable scores are 0.8 or higher, exact match on "1" rejects low value in scientific notation with minus sign present
	return []string{
		"-pattern", "Id",
		"-if", "@score", "-equals", "1",
		"-or", "@score", "-starts-with", "0.9",
		"-or", "@score", "-starts-with", "0.8",
		"-element", "Id",
	}
}
&& 6496 ch != '+' && 6497 ch != '-' && 6498 ch != '*' && 6499 ch != '/' && 6500 ch != ',' && 6501 ch != '$' && 6502 ch != '#' && 6503 ch != '%' && 6504 ch != '(' && 6505 ch != ')' { 6506 return false 6507 } 6508 } 6509 6510 return true 6511 } 6512 6513 if isAllNumeric(patrn) { 6514 year = patrn 6515 patrn = args[0] 6516 args = args[1:] 6517 } 6518 6519 ident := args[0] 6520 args = args[1:] 6521 6522 if !isPipe { 6523 if !deStop { 6524 acc = append(acc, "-stops") 6525 } 6526 if doStem { 6527 acc = append(acc, "-stems") 6528 } 6529 } 6530 6531 if isPipe { 6532 acc = append(acc, "-head", "<IdxDocumentSet>", "-tail", "</IdxDocumentSet>") 6533 acc = append(acc, "-hd", " <IdxDocument>\\n", "-tl", " </IdxDocument>") 6534 acc = append(acc, "-pattern") 6535 acc = append(acc, patrn) 6536 if year != "" { 6537 acc = append(acc, "-if", "PubDate/Year", "-ge", year) 6538 acc = append(acc, "-or", "PubDate/MedlineDate[1:4]", "-ge", year) 6539 } 6540 acc = append(acc, "-pfx", " <IdxUid>", "-sfx", "</IdxUid>\\n") 6541 acc = append(acc, "-element") 6542 acc = append(acc, ident) 6543 acc = append(acc, "-clr", "-rst", "-tab", "") 6544 acc = append(acc, "-lbl", " <IdxSearchFields>\\n") 6545 acc = append(acc, "-pfx", " <YEAR>", "-sfx", "</YEAR>\\n") 6546 acc = append(acc, "-year", "PubDate/*") 6547 acc = append(acc, "-clr", "-rst", "-tab", "") 6548 acc = append(acc, "-indices") 6549 for _, str := range args { 6550 acc = append(acc, str) 6551 } 6552 if tform != "" { 6553 acc = append(acc, "-clr", "-rst", "-tab", "\"\"") 6554 acc = append(acc, "-sep", ",", "-meshcode") 6555 acc = append(acc, "MeshHeading/DescriptorName@UI,Chemical/NameOfSubstance@UI,SupplMeshName@UI") 6556 } 6557 acc = append(acc, "-clr", "-lbl", " </IdxSearchFields>\\n") 6558 } else { 6559 acc = append(acc, "-head", "\"<IdxDocumentSet>\"", "-tail", "\"</IdxDocumentSet>\"") 6560 acc = append(acc, "-hd", "\" <IdxDocument>\\n\"", "-tl", "\" </IdxDocument>\"") 6561 acc = append(acc, "-pattern") 6562 ql := fmt.Sprintf("\"%s\"", 
patrn) 6563 acc = append(acc, ql) 6564 if year != "" { 6565 acc = append(acc, "-if", "PubDate/Year", "-ge", year) 6566 acc = append(acc, "-or", "PubDate/MedlineDate[1:4]", "-ge", year) 6567 } 6568 acc = append(acc, "-pfx", "\" <IdxUid>\"", "-sfx", "\"</IdxUid>\\n\"") 6569 acc = append(acc, "-element") 6570 ql = fmt.Sprintf("\"%s\"", ident) 6571 acc = append(acc, ql) 6572 acc = append(acc, "-clr", "-rst", "-tab", "\"\"") 6573 acc = append(acc, "-lbl", "\" <IdxSearchFields>\\n\"") 6574 acc = append(acc, "-pfx", "\" <YEAR>\"", "-sfx", "\"</YEAR>\\n\"") 6575 acc = append(acc, "-year", "\"PubDate/*\"") 6576 acc = append(acc, "-clr", "-rst", "-tab", "\"\"") 6577 acc = append(acc, "-indices") 6578 for _, str := range args { 6579 ql = fmt.Sprintf("\"%s\"", str) 6580 acc = append(acc, ql) 6581 } 6582 if tform != "" { 6583 acc = append(acc, "-clr", "-rst", "-tab", "\"\"") 6584 acc = append(acc, "-sep", "\",\"", "-meshcode") 6585 acc = append(acc, "\"MeshHeading/DescriptorName@UI,Chemical/NameOfSubstance@UI,SupplMeshName@UI\"") 6586 } 6587 acc = append(acc, "-clr", "-lbl", "\" </IdxSearchFields>\\n\"") 6588 } 6589 6590 return acc 6591} 6592 6593// CONCURRENT CONSUMER GOROUTINES PARSE AND PROCESS PARTITIONED XML OBJECTS 6594 6595// StreamBlocks -> SplitPattern => XmlParse => StreamTokens => ProcessQuery -> MergeResults 6596 6597// processes with single goroutine call defer close(out) so consumer(s) can range over channel 6598// processes with multiple instances call defer wg.Done(), separate goroutine uses wg.Wait() to delay close(out) 6599 6600func createConsumers(cmds *Block, parent, hd, tl string, transform map[string]string, histogram map[string]int, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord { 6601 6602 if inp == nil { 6603 return nil 6604 } 6605 6606 out := make(chan eutils.XMLRecord, eutils.ChanDepth()) 6607 if out == nil { 6608 fmt.Fprintf(os.Stderr, "\nERROR: Unable to create consumer channel\n") 6609 os.Exit(1) 6610 } 6611 6612 // xmlConsumer reads 
partitioned XML from channel and calls parser for processing 6613 xmlConsumer := func(cmds *Block, parent string, wg *sync.WaitGroup, inp <-chan eutils.XMLRecord, out chan<- eutils.XMLRecord) { 6614 6615 // report when this consumer has no more records to process 6616 defer wg.Done() 6617 6618 // read partitioned XML from producer channel 6619 for ext := range inp { 6620 6621 idx := ext.Index 6622 text := ext.Text 6623 6624 if text == "" { 6625 // should never see empty input data 6626 out <- eutils.XMLRecord{Index: idx, Text: text} 6627 continue 6628 } 6629 6630 str := processQuery(text[:], parent, idx, hd, tl, transform, histogram, cmds) 6631 6632 // send even if empty to get all record counts for reordering 6633 out <- eutils.XMLRecord{Index: idx, Text: str} 6634 } 6635 } 6636 6637 var wg sync.WaitGroup 6638 6639 // launch multiple consumer goroutines 6640 for i := 0; i < eutils.NumServe(); i++ { 6641 wg.Add(1) 6642 go xmlConsumer(cmds, parent, &wg, inp, out) 6643 } 6644 6645 // launch separate anonymous goroutine to wait until all consumers are done 6646 go func() { 6647 wg.Wait() 6648 close(out) 6649 }() 6650 6651 return out 6652} 6653 6654func createSelectors(parent, indx string, order map[string]bool, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord { 6655 6656 if parent == "" || indx == "" || order == nil || inp == nil { 6657 return nil 6658 } 6659 6660 find := eutils.ParseIndex(indx) 6661 6662 out := make(chan eutils.XMLRecord, eutils.ChanDepth()) 6663 if out == nil { 6664 fmt.Fprintf(os.Stderr, "\nERROR: Unable to create selector channel\n") 6665 os.Exit(1) 6666 } 6667 6668 // xmlSelector reads partitioned XML from channel and matches identifiers of records to keep 6669 xmlSelector := func(wg *sync.WaitGroup, inp <-chan eutils.XMLRecord, out chan<- eutils.XMLRecord) { 6670 6671 // report when this selector has no more records to process 6672 defer wg.Done() 6673 6674 // read partitioned XML from producer channel 6675 for ext := range inp { 6676 6677 
text := ext.Text 6678 6679 found := false 6680 6681 eutils.FindIdentifiers(text[:], parent, find, 6682 func(id string) { 6683 id = sortStringByWords(id) 6684 _, ok := order[id] 6685 if ok { 6686 found = true 6687 } 6688 }) 6689 6690 if !found { 6691 // identifier field not found or not in identifier list, send empty placeholder for unshuffler 6692 out <- eutils.XMLRecord{Index: ext.Index} 6693 continue 6694 } 6695 6696 // send selected record 6697 out <- eutils.XMLRecord{Index: ext.Index, Text: text} 6698 } 6699 } 6700 6701 var wg sync.WaitGroup 6702 6703 // launch multiple selector goroutines 6704 for i := 0; i < eutils.NumServe(); i++ { 6705 wg.Add(1) 6706 go xmlSelector(&wg, inp, out) 6707 } 6708 6709 // launch separate anonymous goroutine to wait until all selectors are done 6710 go func() { 6711 wg.Wait() 6712 close(out) 6713 }() 6714 6715 return out 6716} 6717 6718// MAIN FUNCTION 6719 6720// e.g., xtract -pattern PubmedArticle -element MedlineCitation/PMID -block Author -sep " " -element Initials,LastName 6721 6722func main() { 6723 6724 // skip past executable name 6725 args := os.Args[1:] 6726 6727 if len(args) < 1 { 6728 fmt.Fprintf(os.Stderr, "\nERROR: No command-line arguments supplied to xtract\n") 6729 os.Exit(1) 6730 } 6731 6732 // performance arguments 6733 chanDepth := 0 6734 farmSize := 0 6735 heapSize := 0 6736 numServe := 0 6737 goGc := 0 6738 6739 // processing option arguments 6740 doCompress := false 6741 doCleanup := false 6742 doStrict := false 6743 doMixed := false 6744 deAccent := false 6745 doASCII := false 6746 doStem = false 6747 deStop = true 6748 6749 /* 6750 doUnicode := false 6751 doScript := false 6752 doMathML := false 6753 */ 6754 6755 // CONCURRENCY, CLEANUP, AND DEBUGGING FLAGS 6756 6757 // do these first because -defcpu and -maxcpu can be sent from wrapper before other arguments 6758 6759 ncpu := runtime.NumCPU() 6760 if ncpu < 1 { 6761 ncpu = 1 6762 } 6763 6764 // wrapper can limit maximum number of processors to use 
(undocumented) 6765 maxProcs := 0 6766 defProcs := 0 6767 6768 // concurrent performance tuning parameters, can be overridden by -proc and -cons 6769 numProcs := 0 6770 serverRatio := 4 6771 6772 // -flag sets -strict or -mixed cleanup flags from argument 6773 flgs := "" 6774 6775 /* 6776 unicodePolicy := "" 6777 scriptPolicy := "" 6778 mathmlPolicy := "" 6779 */ 6780 6781 // read data from file instead of stdin 6782 fileName := "" 6783 6784 // debugging 6785 mpty := false 6786 idnt := false 6787 stts := false 6788 timr := false 6789 6790 // profiling 6791 prfl := false 6792 6793 // repeat the specified extraction 5 times for each -proc from 1 to nCPU 6794 trial := false 6795 6796 inSwitch := true 6797 6798 // get concurrency, cleanup, and debugging flags in any order 6799 for { 6800 6801 inSwitch = true 6802 6803 switch args[0] { 6804 // concurrency override arguments can be passed in by local wrapper script (undocumented) 6805 case "-maxcpu": 6806 maxProcs = eutils.GetNumericArg(args, "Maximum number of processors", 1, 1, ncpu) 6807 args = args[1:] 6808 case "-defcpu": 6809 defProcs = eutils.GetNumericArg(args, "Default number of processors", ncpu, 1, ncpu) 6810 args = args[1:] 6811 // performance tuning flags 6812 case "-proc": 6813 numProcs = eutils.GetNumericArg(args, "Number of processors", ncpu, 1, ncpu) 6814 args = args[1:] 6815 case "-cons": 6816 serverRatio = eutils.GetNumericArg(args, "Parser to processor ratio", 4, 1, 32) 6817 args = args[1:] 6818 case "-serv": 6819 numServe = eutils.GetNumericArg(args, "Concurrent parser count", 0, 1, 128) 6820 args = args[1:] 6821 case "-chan": 6822 chanDepth = eutils.GetNumericArg(args, "Communication channel depth", 0, ncpu, 128) 6823 args = args[1:] 6824 case "-heap": 6825 heapSize = eutils.GetNumericArg(args, "Unshuffler heap size", 8, 8, 64) 6826 args = args[1:] 6827 case "-farm": 6828 farmSize = eutils.GetNumericArg(args, "Node buffer length", 4, 4, 2048) 6829 args = args[1:] 6830 case "-gogc": 6831 goGc = 
eutils.GetNumericArg(args, "Garbage collection percentage", 0, 50, 1000) 6832 args = args[1:] 6833 6834 // read data from file 6835 case "-input": 6836 fileName = eutils.GetStringArg(args, "Input file name") 6837 args = args[1:] 6838 6839 // data cleanup flags 6840 case "-compress", "-compressed": 6841 doCompress = true 6842 case "-spaces", "-cleanup": 6843 doCleanup = true 6844 case "-strict": 6845 doStrict = true 6846 case "-mixed": 6847 doMixed = true 6848 case "-accent": 6849 deAccent = true 6850 case "-ascii": 6851 doASCII = true 6852 6853 // previously visible processing flags (undocumented) 6854 case "-stems", "-stem": 6855 doStem = true 6856 case "-stops", "-stop": 6857 deStop = false 6858 6859 // allow setting of unicode, script, and mathml flags (undocumented) 6860 case "-unicode": 6861 // unicodePolicy = GetStringArg(args, "Unicode argument") 6862 args = args[1:] 6863 case "-script": 6864 // scriptPolicy = GetStringArg(args, "Script argument") 6865 args = args[1:] 6866 case "-mathml": 6867 // mathmlPolicy = GetStringArg(args, "MathML argument") 6868 args = args[1:] 6869 6870 case "-flag", "-flags": 6871 flgs = eutils.GetStringArg(args, "Flags argument") 6872 args = args[1:] 6873 6874 // debugging flags 6875 case "-debug": 6876 // dbug = true 6877 case "-empty": 6878 mpty = true 6879 case "-ident": 6880 idnt = true 6881 case "-stats", "-stat": 6882 stts = true 6883 case "-timer": 6884 timr = true 6885 case "-profile": 6886 prfl = true 6887 case "-trial", "-trials": 6888 trial = true 6889 6890 default: 6891 // if not any of the controls, set flag to break out of for loop 6892 inSwitch = false 6893 } 6894 6895 if !inSwitch { 6896 break 6897 } 6898 6899 // skip past argument 6900 args = args[1:] 6901 6902 if len(args) < 1 { 6903 break 6904 } 6905 } 6906 6907 // -flag allows script to set -strict or -mixed (or -stems, or -stops) from argument 6908 switch flgs { 6909 case "strict": 6910 doStrict = true 6911 case "mixed": 6912 doMixed = true 6913 case "stems", 
"stem": 6914 doStem = true 6915 case "stops", "stop": 6916 deStop = false 6917 case "none", "default": 6918 default: 6919 if flgs != "" { 6920 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized -flag value '%s'\n", flgs) 6921 os.Exit(1) 6922 } 6923 } 6924 6925 /* 6926 UnicodeFix = parseMarkup(unicodePolicy, "-unicode") 6927 ScriptFix = parseMarkup(scriptPolicy, "-script") 6928 MathMLFix = parseMarkup(mathmlPolicy, "-mathml") 6929 6930 if UnicodeFix != NOMARKUP { 6931 doUnicode = true 6932 } 6933 6934 if ScriptFix != NOMARKUP { 6935 doScript = true 6936 } 6937 6938 if MathMLFix != NOMARKUP { 6939 doMathML = true 6940 } 6941 */ 6942 6943 if numProcs == 0 { 6944 if defProcs > 0 { 6945 numProcs = defProcs 6946 } else if maxProcs > 0 { 6947 numProcs = maxProcs 6948 } 6949 } 6950 if numProcs > ncpu { 6951 numProcs = ncpu 6952 } 6953 if numProcs > maxProcs { 6954 numProcs = maxProcs 6955 } 6956 6957 eutils.SetTunings(numProcs, numServe, serverRatio, chanDepth, farmSize, heapSize, goGc) 6958 6959 eutils.SetOptions(doStrict, doMixed, deAccent, doASCII, doCompress, doCleanup) 6960 6961 // -stats prints number of CPUs and performance tuning values if no other arguments (undocumented) 6962 if stts && len(args) < 1 { 6963 6964 eutils.PrintStats() 6965 6966 return 6967 } 6968 6969 if len(args) < 1 { 6970 fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to xtract\n") 6971 os.Exit(1) 6972 } 6973 6974 // DOCUMENTATION COMMANDS 6975 6976 inSwitch = true 6977 6978 switch args[0] { 6979 case "-version": 6980 fmt.Printf("%s\n", eutils.EDirectVersion) 6981 case "-help": 6982 fmt.Printf("xtract %s\n%s\n", eutils.EDirectVersion, xtractHelp) 6983 case "-examples", "-example": 6984 ex, eerr := os.Executable() 6985 if eerr == nil { 6986 fmt.Printf("xtract %s\n\n", eutils.EDirectVersion) 6987 exPath := filepath.Dir(ex) 6988 fpath := path.Join(exPath, "hlp-xtract.txt") 6989 file, ferr := os.Open(fpath) 6990 if file != nil && ferr == nil { 6991 scanner := 
bufio.NewScanner(file) 6992 for scanner.Scan() { 6993 fmt.Println(scanner.Text()) 6994 } 6995 } 6996 file.Close() 6997 fmt.Printf("\n") 6998 } 6999 case "-extras", "-extra", "-advanced": 7000 fmt.Printf("Please run rchive -help for local record indexing information\n") 7001 case "-internal", "-internals": 7002 fmt.Printf("xtract %s\n%s\n", eutils.EDirectVersion, xtractInternal) 7003 case "-keys": 7004 fmt.Printf("%s\n", keyboardShortcuts) 7005 case "-unix": 7006 fmt.Printf("%s\n", unixCommands) 7007 default: 7008 // if not any of the documentation commands, keep going 7009 inSwitch = false 7010 } 7011 7012 if inSwitch { 7013 return 7014 } 7015 7016 // FILE NAME CAN BE SUPPLIED WITH -input COMMAND 7017 7018 in := os.Stdin 7019 7020 // check for data being piped into stdin 7021 isPipe := false 7022 fi, err := os.Stdin.Stat() 7023 if err == nil { 7024 isPipe = bool((fi.Mode() & os.ModeNamedPipe) != 0) 7025 } 7026 7027 usingFile := false 7028 7029 if fileName != "" { 7030 7031 inFile, err := os.Open(fileName) 7032 if err != nil { 7033 fmt.Fprintf(os.Stderr, "\nERROR: Unable to open input file '%s'\n", fileName) 7034 os.Exit(1) 7035 } 7036 7037 defer inFile.Close() 7038 7039 // use indicated file instead of stdin 7040 in = inFile 7041 usingFile = true 7042 7043 if isPipe && runtime.GOOS != "windows" { 7044 mode := fi.Mode().String() 7045 fmt.Fprintf(os.Stderr, "\nERROR: Input data from both stdin and file '%s', mode is '%s'\n", fileName, mode) 7046 os.Exit(1) 7047 } 7048 } 7049 7050 // check for -input command after extraction arguments 7051 for _, str := range args { 7052 if str == "-input" { 7053 fmt.Fprintf(os.Stderr, "\nERROR: Misplaced -input command\n") 7054 os.Exit(1) 7055 } 7056 } 7057 7058 // START PROFILING IF REQUESTED 7059 7060 if prfl { 7061 7062 f, err := os.Create("cpu.pprof") 7063 if err != nil { 7064 fmt.Fprintf(os.Stderr, "\nERROR: Unable to create profile output file\n") 7065 os.Exit(1) 7066 } 7067 7068 pprof.StartCPUProfile(f) 7069 7070 defer 
pprof.StopCPUProfile() 7071 } 7072 7073 // INITIALIZE RECORD COUNT 7074 7075 recordCount := 0 7076 byteCount := 0 7077 7078 // print processing rate and program duration 7079 printDuration := func(name string) { 7080 7081 eutils.PrintDuration(name, recordCount, byteCount) 7082 } 7083 7084 // NAME OF OUTPUT STRING TRANSFORMATION FILE 7085 7086 tform := "" 7087 transform := make(map[string]string) 7088 7089 populateTx := func(tf string) { 7090 7091 inFile, err := os.Open(tf) 7092 if err != nil { 7093 fmt.Fprintf(os.Stderr, "Unable to open transformation file %s\n", err.Error()) 7094 os.Exit(1) 7095 } 7096 defer inFile.Close() 7097 7098 scanr := bufio.NewScanner(inFile) 7099 7100 // populate transformation map for -translate (and -matrix) output 7101 for scanr.Scan() { 7102 7103 line := scanr.Text() 7104 frst, scnd := eutils.SplitInTwoLeft(line, "\t") 7105 7106 transform[frst] = scnd 7107 } 7108 } 7109 7110 if len(args) > 2 && args[0] == "-transform" { 7111 tform = args[1] 7112 args = args[2:] 7113 if tform != "" { 7114 populateTx(tform) 7115 } 7116 } 7117 7118 // CREATE XML BLOCK READER FROM STDIN OR FILE 7119 7120 rdr := eutils.CreateXMLStreamer(in) 7121 if rdr == nil { 7122 fmt.Fprintf(os.Stderr, "\nERROR: Unable to create XML Block Reader\n") 7123 os.Exit(1) 7124 } 7125 7126 // SEQUENCE RECORD EXTRACTION COMMAND GENERATOR 7127 7128 // -insd simplifies extraction of INSDSeq qualifiers 7129 if args[0] == "-insd" || args[0] == "-insd-" || args[0] == "-insd-idx" { 7130 7131 addDash := true 7132 doIndex := false 7133 // -insd- variant suppresses use of dash as placeholder for missing qualifiers (undocumented) 7134 if args[0] == "-insd-" { 7135 addDash = false 7136 } 7137 // -insd-idx variant creates word index using -indices command (undocumented) 7138 if args[0] == "-insd-idx" { 7139 doIndex = true 7140 addDash = false 7141 } 7142 7143 args = args[1:] 7144 7145 insd := processINSD(args, isPipe || usingFile, addDash, doIndex) 7146 7147 if !isPipe && !usingFile { 7148 
// no piped input, so write output instructions 7149 fmt.Printf("xtract") 7150 for _, str := range insd { 7151 fmt.Printf(" %s", str) 7152 } 7153 fmt.Printf("\n") 7154 return 7155 } 7156 7157 // data in pipe, so replace arguments, execute dynamically 7158 args = insd 7159 } 7160 7161 // CITATION MATCHER EXTRACTION COMMAND GENERATOR 7162 7163 // -hydra filters HydraResponse output by relevance score (undocumented) 7164 if args[0] == "-hydra" { 7165 7166 hydra := processHydra(isPipe || usingFile) 7167 7168 if !isPipe && !usingFile { 7169 // no piped input, so write output instructions 7170 fmt.Printf("xtract") 7171 for _, str := range hydra { 7172 fmt.Printf(" %s", str) 7173 } 7174 fmt.Printf("\n") 7175 return 7176 } 7177 7178 // data in pipe, so replace arguments, execute dynamically 7179 args = hydra 7180 } 7181 7182 // BIOTHINGS EXTRACTION COMMAND GENERATOR 7183 7184 // -biopath takes a parent object and a dotted exploration path for BioThings resources (undocumented) 7185 if args[0] == "-biopath" { 7186 7187 args = args[1:] 7188 7189 biopath := processBiopath(args, isPipe || usingFile) 7190 7191 if !isPipe && !usingFile { 7192 // no piped input, so write output instructions 7193 fmt.Printf("xtract") 7194 for _, str := range biopath { 7195 fmt.Printf(" %s", str) 7196 } 7197 fmt.Printf("\n") 7198 return 7199 } 7200 7201 // data in pipe, so replace arguments, execute dynamically 7202 args = biopath 7203 } 7204 7205 // ENTREZ2INDEX COMMAND GENERATOR 7206 7207 // -e2index shortcut for experimental indexing code (documented in rchive.go) 7208 if args[0] == "-e2index" { 7209 7210 // e.g., xtract -transform "$EDIRECT_MESH_TREE" -e2index 7211 7212 args = args[1:] 7213 7214 if len(args) == 0 { 7215 // if no arguments, use default values 7216 args = []string{"PubmedArticle", "MedlineCitation/PMID", "ArticleTitle,Abstract/AbstractText"} 7217 } 7218 7219 // environment variable can override garbage collector (undocumented) 7220 gcEnv := os.Getenv("EDIRECT_INDEX_GOGC") 7221 if 
gcEnv != "" { 7222 val, err := strconv.Atoi(gcEnv) 7223 if err == nil { 7224 if val >= 50 && val <= 1000 { 7225 debug.SetGCPercent(val) 7226 } else { 7227 debug.SetGCPercent(100) 7228 } 7229 } 7230 } 7231 7232 // environment variable can override number of servers (undocumented) 7233 svEnv := os.Getenv("EDIRECT_INDEX_SERV") 7234 if svEnv != "" { 7235 val, err := strconv.Atoi(svEnv) 7236 if err == nil { 7237 if val >= 1 && val <= 128 { 7238 numServe = val 7239 } else { 7240 numServe = 1 7241 } 7242 eutils.SetTunings(numProcs, numServe, serverRatio, chanDepth, farmSize, heapSize, goGc) 7243 } 7244 } 7245 7246 res := processE2Index(args, tform, isPipe || usingFile) 7247 7248 if !isPipe && !usingFile { 7249 // no piped input, so write output instructions 7250 fmt.Printf("xtract") 7251 if tform != "" { 7252 fmt.Printf(" -transform %s", tform) 7253 } 7254 for _, str := range res { 7255 fmt.Printf(" %s", str) 7256 } 7257 fmt.Printf("\n") 7258 return 7259 } 7260 7261 // data in pipe, so replace arguments, execute dynamically 7262 args = res 7263 } 7264 7265 // CONFIRM INPUT DATA AVAILABILITY AFTER RUNNING COMMAND GENERATORS 7266 7267 if fileName == "" && runtime.GOOS != "windows" { 7268 7269 fromStdin := bool((fi.Mode() & os.ModeCharDevice) == 0) 7270 if !isPipe || !fromStdin { 7271 mode := fi.Mode().String() 7272 fmt.Fprintf(os.Stderr, "\nERROR: No data supplied to xtract from stdin or file, mode is '%s'\n", mode) 7273 os.Exit(1) 7274 } 7275 } 7276 7277 if !usingFile && !isPipe { 7278 7279 fmt.Fprintf(os.Stderr, "\nERROR: No XML input data supplied to xtract\n") 7280 os.Exit(1) 7281 } 7282 7283 // XML VALIDATION 7284 7285 nextArg := func() (string, bool) { 7286 7287 if len(args) < 1 { 7288 return "", false 7289 } 7290 7291 // remove next token from slice 7292 nxt := args[0] 7293 args = args[1:] 7294 7295 return nxt, true 7296 } 7297 7298 if args[0] == "-verify" || args[0] == "-validate" { 7299 7300 // skip past command name 7301 args = args[1:] 7302 7303 find := "" 7304 
html := false 7305 7306 // look for optional arguments 7307 for { 7308 arg, ok := nextArg() 7309 if !ok { 7310 break 7311 } 7312 7313 switch arg { 7314 case "-find": 7315 // override set wrapper 7316 find, ok = nextArg() 7317 case "-html": 7318 html = true 7319 } 7320 } 7321 7322 recordCount = eutils.ValidateXML(rdr, find, html) 7323 7324 debug.FreeOSMemory() 7325 7326 // suppress printing of lines if not properly counted 7327 if recordCount == 1 { 7328 recordCount = 0 7329 } 7330 7331 if timr { 7332 printDuration("lines") 7333 } 7334 7335 return 7336 } 7337 7338 // MISCELLANEOUS TIMING COMMANDS 7339 7340 if args[0] == "-chunk" { 7341 7342 for str := range rdr { 7343 recordCount++ 7344 byteCount += len(str) 7345 } 7346 7347 printDuration("blocks") 7348 7349 return 7350 } 7351 7352 if args[0] == "-split" { 7353 7354 if len(args) > 1 { 7355 if args[1] == "-pattern" { 7356 // skip past -split if followed by -pattern 7357 args = args[1:] 7358 } 7359 } 7360 if len(args) < 2 { 7361 fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -split command\n") 7362 os.Exit(1) 7363 } 7364 pat := args[1] 7365 7366 eutils.PartitionPattern(pat, "", rdr, 7367 func(str string) { 7368 recordCount++ 7369 byteCount += len(str) 7370 }) 7371 7372 printDuration("patterns") 7373 7374 return 7375 } 7376 7377 if args[0] == "-token" { 7378 7379 eutils.StreamTokens(rdr, 7380 func(tkn eutils.XMLToken) { 7381 recordCount++ 7382 byteCount += len(tkn.Name) + len(tkn.Attr) 7383 }) 7384 7385 printDuration("tokens") 7386 7387 return 7388 } 7389 7390 // SPECIFY STRINGS TO GO BEFORE AND AFTER ENTIRE OUTPUT OR EACH RECORD 7391 7392 head := "" 7393 tail := "" 7394 7395 hd := "" 7396 tl := "" 7397 7398 for { 7399 7400 inSwitch = true 7401 7402 switch args[0] { 7403 case "-head": 7404 if len(args) < 2 { 7405 fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -head command\n") 7406 os.Exit(1) 7407 } 7408 head = eutils.ConvertSlash(args[1]) 7409 // allow splitting of -head argument, keep appending until 
next command (undocumented) 7410 ofs, nxt := 0, args[2:] 7411 for { 7412 if len(nxt) < 1 { 7413 break 7414 } 7415 tmp := nxt[0] 7416 if strings.HasPrefix(tmp, "-") { 7417 break 7418 } 7419 ofs++ 7420 txt := eutils.ConvertSlash(tmp) 7421 if head != "" && !strings.HasSuffix(head, "\t") { 7422 head += "\t" 7423 } 7424 head += txt 7425 nxt = nxt[1:] 7426 } 7427 if ofs > 0 { 7428 args = args[ofs:] 7429 } 7430 case "-tail": 7431 if len(args) < 2 { 7432 fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -tail command\n") 7433 os.Exit(1) 7434 } 7435 tail = eutils.ConvertSlash(args[1]) 7436 case "-hd": 7437 if len(args) < 2 { 7438 fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -hd command\n") 7439 os.Exit(1) 7440 } 7441 hd = eutils.ConvertSlash(args[1]) 7442 case "-tl": 7443 if len(args) < 2 { 7444 fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -tl command\n") 7445 os.Exit(1) 7446 } 7447 tl = eutils.ConvertSlash(args[1]) 7448 case "-wrp": 7449 // shortcut to wrap records in XML tags 7450 if len(args) < 2 { 7451 fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -wrp command\n") 7452 os.Exit(1) 7453 } 7454 tmp := eutils.ConvertSlash(args[1]) 7455 lft, rgt := eutils.SplitInTwoLeft(tmp, ",") 7456 if lft != "" { 7457 head = "<" + lft + ">" 7458 tail = "</" + lft + ">" 7459 } 7460 if rgt != "" { 7461 hd = "<" + rgt + ">" 7462 tl = "</" + rgt + ">" 7463 } 7464 case "-set": 7465 if len(args) < 2 { 7466 fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -set command\n") 7467 os.Exit(1) 7468 } 7469 tmp := eutils.ConvertSlash(args[1]) 7470 if tmp != "" { 7471 head = "<" + tmp + ">" 7472 tail = "</" + tmp + ">" 7473 } 7474 case "-rec": 7475 if len(args) < 2 { 7476 fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -rec command\n") 7477 os.Exit(1) 7478 } 7479 tmp := eutils.ConvertSlash(args[1]) 7480 if tmp != "" { 7481 hd = "<" + tmp + ">" 7482 tl = "</" + tmp + ">" 7483 } 7484 default: 7485 // if not any of the controls, set flag to break out of for 
loop 7486 inSwitch = false 7487 } 7488 7489 if !inSwitch { 7490 break 7491 } 7492 7493 // skip past arguments 7494 args = args[2:] 7495 7496 if len(args) < 1 { 7497 fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to xtract\n") 7498 os.Exit(1) 7499 } 7500 } 7501 7502 // ENSURE PRESENCE OF PATTERN ARGUMENT 7503 7504 if len(args) < 1 { 7505 fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to xtract\n") 7506 os.Exit(1) 7507 } 7508 7509 // allow -record as synonym of -pattern (undocumented) 7510 if args[0] == "-record" || args[0] == "-Record" { 7511 args[0] = "-pattern" 7512 } 7513 7514 // make sure top-level -pattern command is next 7515 if args[0] != "-pattern" && args[0] != "-Pattern" { 7516 fmt.Fprintf(os.Stderr, "\nERROR: No -pattern in command-line arguments\n") 7517 os.Exit(1) 7518 } 7519 if len(args) < 2 { 7520 fmt.Fprintf(os.Stderr, "\nERROR: Item missing after -pattern command\n") 7521 os.Exit(1) 7522 } 7523 7524 topPat := args[1] 7525 if topPat == "" { 7526 fmt.Fprintf(os.Stderr, "\nERROR: Item missing after -pattern command\n") 7527 os.Exit(1) 7528 } 7529 if strings.HasPrefix(topPat, "-") { 7530 fmt.Fprintf(os.Stderr, "\nERROR: Misplaced %s command\n", topPat) 7531 os.Exit(1) 7532 } 7533 7534 // look for -pattern Parent/* construct for heterogeneous data, e.g., -pattern PubmedArticleSet/* 7535 topPattern, star := eutils.SplitInTwoLeft(topPat, "/") 7536 if topPattern == "" { 7537 return 7538 } 7539 7540 parent := "" 7541 if star == "*" { 7542 parent = topPattern 7543 } else if star != "" { 7544 fmt.Fprintf(os.Stderr, "\nERROR: -pattern Parent/Child construct is not supported\n") 7545 os.Exit(1) 7546 } 7547 7548 // READ FILE OF IDENTIFIERS AND CONCURRENTLY EXTRACT SELECTED RECORDS 7549 7550 // -pattern record_name -select parent/element@attribute^version -in file_of_identifiers 7551 if len(args) == 6 && args[2] == "-select" && (args[4] == "-in" || args[4] == "-retaining") { 7552 7553 indx := args[3] 
7554 unqe := args[5] 7555 7556 // read file of identifiers to use for filtering 7557 fl, err := os.Open(unqe) 7558 if err != nil { 7559 fmt.Fprintf(os.Stderr, "\nERROR: Unable to open identifier file '%s'\n", unqe) 7560 os.Exit(1) 7561 } 7562 7563 // create map that records each UID 7564 order := make(map[string]bool) 7565 7566 scanr := bufio.NewScanner(fl) 7567 7568 // read lines of identifiers 7569 for scanr.Scan() { 7570 7571 line := scanr.Text() 7572 id, _ := eutils.SplitInTwoLeft(line, "\t") 7573 7574 id = sortStringByWords(id) 7575 7576 // add identifier to map 7577 order[id] = true 7578 } 7579 7580 fl.Close() 7581 7582 xmlq := eutils.CreateXMLProducer(topPattern, star, rdr) 7583 fchq := createSelectors(topPattern, indx, order, xmlq) 7584 unsq := eutils.CreateXMLUnshuffler(fchq) 7585 7586 if xmlq == nil || fchq == nil || unsq == nil { 7587 fmt.Fprintf(os.Stderr, "\nERROR: Unable to create selector\n") 7588 os.Exit(1) 7589 } 7590 7591 if head != "" { 7592 os.Stdout.WriteString(head) 7593 os.Stdout.WriteString("\n") 7594 } 7595 7596 // drain output channel 7597 for curr := range unsq { 7598 7599 str := curr.Text 7600 7601 if str == "" { 7602 continue 7603 } 7604 7605 if hd != "" { 7606 os.Stdout.WriteString(hd) 7607 os.Stdout.WriteString("\n") 7608 } 7609 7610 // send result to output 7611 os.Stdout.WriteString(str) 7612 if !strings.HasSuffix(str, "\n") { 7613 os.Stdout.WriteString("\n") 7614 } 7615 7616 if tl != "" { 7617 os.Stdout.WriteString(tl) 7618 os.Stdout.WriteString("\n") 7619 } 7620 7621 recordCount++ 7622 runtime.Gosched() 7623 } 7624 7625 if tail != "" { 7626 os.Stdout.WriteString(tail) 7627 os.Stdout.WriteString("\n") 7628 } 7629 7630 debug.FreeOSMemory() 7631 7632 if timr { 7633 printDuration("records") 7634 } 7635 7636 return 7637 } 7638 7639 // READ FILE OF IDENTIFIERS AND EXCLUDE SELECTED RECORDS 7640 7641 // -pattern record_name -exclude element -excluding file_of_identifiers (undocumented) 7642 if len(args) == 6 && args[2] == "-select" && 
args[4] == "-excluding" { 7643 7644 indx := args[3] 7645 unqe := args[5] 7646 7647 // read file of identifiers to use for filtering 7648 fl, err := os.Open(unqe) 7649 if err != nil { 7650 fmt.Fprintf(os.Stderr, "\nERROR: Unable to open identifier file '%s'\n", unqe) 7651 os.Exit(1) 7652 } 7653 7654 // create map that records each UID 7655 order := make(map[string]bool) 7656 7657 scanr := bufio.NewScanner(fl) 7658 7659 // read lines of identifiers 7660 for scanr.Scan() { 7661 7662 line := scanr.Text() 7663 id, _ := eutils.SplitInTwoLeft(line, "\t") 7664 id = strings.ToLower(id) 7665 7666 // add identifier to map 7667 order[id] = true 7668 } 7669 7670 fl.Close() 7671 7672 find := eutils.ParseIndex(indx) 7673 7674 if head != "" { 7675 os.Stdout.WriteString(head) 7676 os.Stdout.WriteString("\n") 7677 } 7678 7679 eutils.PartitionPattern(topPattern, star, rdr, 7680 func(str string) { 7681 recordCount++ 7682 7683 id := eutils.FindIdentifier(str[:], parent, find) 7684 if id != "" { 7685 id = strings.ToLower(id) 7686 _, ok := order[id] 7687 if ok { 7688 // in exclusion list, skip 7689 return 7690 } 7691 } 7692 7693 if hd != "" { 7694 os.Stdout.WriteString(hd) 7695 os.Stdout.WriteString("\n") 7696 } 7697 7698 // write selected record 7699 os.Stdout.WriteString(str[:]) 7700 os.Stdout.WriteString("\n") 7701 7702 if tl != "" { 7703 os.Stdout.WriteString(tl) 7704 os.Stdout.WriteString("\n") 7705 } 7706 }) 7707 7708 if tail != "" { 7709 os.Stdout.WriteString(tail) 7710 os.Stdout.WriteString("\n") 7711 } 7712 7713 debug.FreeOSMemory() 7714 7715 if timr { 7716 printDuration("records") 7717 } 7718 7719 return 7720 } 7721 7722 // READ ORDERED FILE OF IDENTIFIERS AND XML STRINGS, APPEND XML JUST INSIDE CLOSING TAG OF APPROPRIATE RECORD 7723 7724 // -pattern record_name -select element -appending file_of_identifiers_and_metadata (undocumented) 7725 if len(args) == 6 && args[2] == "-select" && args[4] == "-appending" { 7726 7727 indx := args[3] 7728 apnd := args[5] 7729 7730 fl, err := 
os.Open(apnd) 7731 if err != nil { 7732 fmt.Fprintf(os.Stderr, "\nERROR: Unable to open transformation file '%s'\n", apnd) 7733 os.Exit(1) 7734 } 7735 7736 scanr := bufio.NewScanner(fl) 7737 7738 find := eutils.ParseIndex(indx) 7739 7740 if head != "" { 7741 os.Stdout.WriteString(head) 7742 os.Stdout.WriteString("\n") 7743 } 7744 7745 rgt := "</" + topPattern + ">" 7746 7747 eutils.PartitionPattern(topPattern, star, rdr, 7748 func(str string) { 7749 recordCount++ 7750 7751 id := eutils.FindIdentifier(str[:], parent, find) 7752 if id == "" { 7753 return 7754 } 7755 id = strings.ToLower(id) 7756 7757 for scanr.Scan() { 7758 7759 line := scanr.Text() 7760 frst, scnd := eutils.SplitInTwoLeft(line, "\t") 7761 frst = strings.ToLower(frst) 7762 7763 if id != frst { 7764 return 7765 } 7766 if !strings.HasSuffix(str, rgt) { 7767 return 7768 } 7769 7770 lft := strings.TrimSuffix(str, rgt) 7771 str = lft + " " + scnd + "\n" + rgt 7772 7773 if hd != "" { 7774 os.Stdout.WriteString(hd) 7775 os.Stdout.WriteString("\n") 7776 } 7777 7778 os.Stdout.WriteString(str[:]) 7779 os.Stdout.WriteString("\n") 7780 7781 if tl != "" { 7782 os.Stdout.WriteString(tl) 7783 os.Stdout.WriteString("\n") 7784 } 7785 7786 break 7787 } 7788 }) 7789 7790 if tail != "" { 7791 os.Stdout.WriteString(tail) 7792 os.Stdout.WriteString("\n") 7793 } 7794 7795 fl.Close() 7796 7797 debug.FreeOSMemory() 7798 7799 if timr { 7800 printDuration("records") 7801 } 7802 7803 return 7804 } 7805 7806 // SORT XML RECORDS BY IDENTIFIER 7807 7808 // -pattern record_name -sort parent/element@attribute^version 7809 if len(args) == 4 && args[2] == "-sort" { 7810 7811 indx := args[3] 7812 7813 // create map that records each UID 7814 order := make(map[string][]string) 7815 7816 find := eutils.ParseIndex(indx) 7817 7818 eutils.PartitionPattern(topPattern, star, rdr, 7819 func(str string) { 7820 recordCount++ 7821 7822 id := eutils.FindIdentifier(str[:], parent, find) 7823 if id == "" { 7824 return 7825 } 7826 7827 data, ok := 
order[id] 7828 if !ok { 7829 data = make([]string, 0, 1) 7830 } 7831 data = append(data, str) 7832 // always need to update order, since data may be reallocated 7833 order[id] = data 7834 }) 7835 7836 var keys []string 7837 for ky := range order { 7838 keys = append(keys, ky) 7839 } 7840 // sort fields in alphabetical or numeric order 7841 sort.Slice(keys, func(i, j int) bool { 7842 // numeric sort on strings checks lengths first 7843 if eutils.IsAllDigits(keys[i]) && eutils.IsAllDigits(keys[j]) { 7844 lni := len(keys[i]) 7845 lnj := len(keys[j]) 7846 // shorter string is numerically less, assuming no leading zeros 7847 if lni < lnj { 7848 return true 7849 } 7850 if lni > lnj { 7851 return false 7852 } 7853 } 7854 // same length or non-numeric, can now do string comparison on contents 7855 return keys[i] < keys[j] 7856 }) 7857 7858 if head != "" { 7859 os.Stdout.WriteString(head) 7860 os.Stdout.WriteString("\n") 7861 } 7862 7863 for _, id := range keys { 7864 7865 strs := order[id] 7866 for _, str := range strs { 7867 os.Stdout.WriteString(str) 7868 os.Stdout.WriteString("\n") 7869 } 7870 } 7871 7872 if tail != "" { 7873 os.Stdout.WriteString(tail) 7874 os.Stdout.WriteString("\n") 7875 } 7876 7877 debug.FreeOSMemory() 7878 7879 if timr { 7880 printDuration("records") 7881 } 7882 7883 return 7884 } 7885 7886 // SPLIT FILE BY BY RECORD COUNT 7887 7888 // split XML record into subfiles by count 7889 if len(args) == 8 && args[2] == "-split" && args[4] == "-prefix" && args[6] == "-suffix" { 7890 7891 // e.g., -head "<IdxDocumentSet>" -tail "</IdxDocumentSet>" -pattern IdxDocument -split 250000 -prefix "biocon" -suffix "e2x" 7892 count := 0 7893 fnum := 0 7894 var ( 7895 fl *os.File 7896 err error 7897 ) 7898 chunk, err := strconv.Atoi(args[3]) 7899 if err != nil { 7900 fmt.Fprintf(os.Stderr, "%s\n", err.Error()) 7901 return 7902 } 7903 prefix := args[5] 7904 suffix := args[7] 7905 7906 eutils.PartitionPattern(topPattern, star, rdr, 7907 func(str string) { 7908 
recordCount++ 7909 7910 if count >= chunk { 7911 if tail != "" { 7912 fl.WriteString(tail) 7913 fl.WriteString("\n") 7914 } 7915 fl.Close() 7916 count = 0 7917 } 7918 if count == 0 { 7919 fpath := fmt.Sprintf("%s%03d.%s", prefix, fnum, suffix) 7920 fl, err = os.Create(fpath) 7921 if err != nil { 7922 fmt.Fprintf(os.Stderr, "%s\n", err.Error()) 7923 return 7924 } 7925 os.Stderr.WriteString(fpath + "\n") 7926 fnum++ 7927 if head != "" { 7928 fl.WriteString(head) 7929 fl.WriteString("\n") 7930 } 7931 } 7932 count++ 7933 7934 fl.WriteString(str[:]) 7935 fl.WriteString("\n") 7936 }) 7937 7938 if count >= chunk { 7939 if tail != "" { 7940 fl.WriteString(tail) 7941 fl.WriteString("\n") 7942 } 7943 fl.Close() 7944 } 7945 7946 debug.FreeOSMemory() 7947 7948 if timr { 7949 printDuration("records") 7950 } 7951 7952 return 7953 } 7954 7955 // PARSE AND VALIDATE EXTRACTION ARGUMENTS 7956 7957 // parse nested exploration instruction from command-line arguments 7958 cmds := parseArguments(args, topPattern) 7959 if cmds == nil { 7960 fmt.Fprintf(os.Stderr, "\nERROR: Problem parsing command-line arguments\n") 7961 os.Exit(1) 7962 } 7963 7964 // GLOBAL MAP FOR SORT-UNIQ-COUNT HISTOGRAM ARGUMENT 7965 7966 histogram := make(map[string]int) 7967 7968 // PERFORMANCE TIMING COMMAND 7969 7970 // -stats with an extraction command prints XML size and processing time for each record 7971 if stts { 7972 7973 legend := "REC\tOFST\tSIZE\tTIME" 7974 7975 rec := 0 7976 7977 eutils.PartitionPattern(topPattern, star, rdr, 7978 func(str string) { 7979 rec++ 7980 beginTime := time.Now() 7981 processQuery(str[:], parent, rec, hd, tl, transform, histogram, cmds) 7982 endTime := time.Now() 7983 duration := endTime.Sub(beginTime) 7984 micro := int(float64(duration.Nanoseconds()) / 1e3) 7985 if legend != "" { 7986 fmt.Printf("%s\n", legend) 7987 legend = "" 7988 } 7989 fmt.Printf("%d\t%d\t%d\n", rec, len(str), micro) 7990 }) 7991 7992 return 7993 } 7994 7995 // PERFORMANCE OPTIMIZATION FUNCTION 7996 7997 
// -trial -input fileName runs the specified extraction for each -proc from 1 to nCPU 7998 if trial && fileName != "" { 7999 8000 legend := "CPU\tRATE\tDEV" 8001 8002 for numServ := 1; numServ <= ncpu; numServ++ { 8003 8004 numServe = numServ 8005 8006 eutils.SetTunings(numProcs, numServe, serverRatio, chanDepth, farmSize, heapSize, goGc) 8007 8008 runtime.GOMAXPROCS(numServ) 8009 8010 sum := 0 8011 count := 0 8012 mean := 0.0 8013 m2 := 0.0 8014 8015 // calculate mean and standard deviation of processing rate 8016 for trials := 0; trials < 5; trials++ { 8017 8018 inFile, err := os.Open(fileName) 8019 if err != nil { 8020 fmt.Fprintf(os.Stderr, "\nERROR: Unable to open input file '%s'\n", fileName) 8021 os.Exit(1) 8022 } 8023 8024 trdr := eutils.CreateXMLStreamer(inFile) 8025 if trdr == nil { 8026 fmt.Fprintf(os.Stderr, "\nERROR: Unable to read input file\n") 8027 os.Exit(1) 8028 } 8029 8030 xmlq := eutils.CreateXMLProducer(topPattern, star, trdr) 8031 tblq := createConsumers(cmds, parent, hd, tl, transform, histogram, xmlq) 8032 8033 if xmlq == nil || tblq == nil { 8034 fmt.Fprintf(os.Stderr, "\nERROR: Unable to create servers\n") 8035 os.Exit(1) 8036 } 8037 8038 begTime := time.Now() 8039 recordCount = 0 8040 8041 for range tblq { 8042 recordCount++ 8043 runtime.Gosched() 8044 } 8045 8046 inFile.Close() 8047 8048 debug.FreeOSMemory() 8049 8050 endTime := time.Now() 8051 expended := endTime.Sub(begTime) 8052 secs := float64(expended.Nanoseconds()) / 1e9 8053 8054 if secs >= 0.000001 && recordCount > 0 { 8055 speed := int(float64(recordCount) / secs) 8056 sum += speed 8057 count++ 8058 x := float64(speed) 8059 delta := x - mean 8060 mean += delta / float64(count) 8061 m2 += delta * (x - mean) 8062 } 8063 } 8064 8065 if legend != "" { 8066 fmt.Printf("%s\n", legend) 8067 legend = "" 8068 } 8069 if count > 1 { 8070 vrc := m2 / float64(count-1) 8071 dev := int(math.Sqrt(vrc)) 8072 fmt.Printf("%d\t%d\t%d\n", numServ, sum/count, dev) 8073 } 8074 } 8075 8076 return 8077 
} 8078 8079 // PROCESS SINGLE SELECTED RECORD IF -pattern ARGUMENT IS IMMEDIATELY FOLLOWED BY -position COMMAND 8080 8081 posn := "" 8082 if cmds.Visit == topPat { 8083 if cmds.Position == "outer" || 8084 cmds.Position == "inner" || 8085 cmds.Position == "even" || 8086 cmds.Position == "odd" || 8087 cmds.Position == "all" { 8088 // filter by record position when draining unshuffler channel 8089 posn = cmds.Position 8090 cmds.Position = "" 8091 } 8092 } 8093 8094 if cmds.Visit == topPat && cmds.Position != "" && cmds.Position != "select" { 8095 8096 qry := "" 8097 idx := 0 8098 rec := 0 8099 8100 if cmds.Position == "first" { 8101 8102 eutils.PartitionPattern(topPattern, star, rdr, 8103 func(str string) { 8104 rec++ 8105 if rec == 1 { 8106 qry = str 8107 idx = rec 8108 } 8109 }) 8110 8111 } else if cmds.Position == "last" { 8112 8113 eutils.PartitionPattern(topPattern, star, rdr, 8114 func(str string) { 8115 qry = str 8116 idx = rec 8117 }) 8118 8119 } else { 8120 8121 // use numeric position 8122 number, err := strconv.Atoi(cmds.Position) 8123 if err != nil { 8124 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized position '%s'\n", cmds.Position) 8125 os.Exit(1) 8126 } 8127 8128 eutils.PartitionPattern(topPattern, star, rdr, 8129 func(str string) { 8130 rec++ 8131 if rec == number { 8132 qry = str 8133 idx = rec 8134 } 8135 }) 8136 } 8137 8138 if qry == "" { 8139 return 8140 } 8141 8142 // clear position on top node to prevent condition test failure 8143 cmds.Position = "" 8144 8145 // process single selected record 8146 res := processQuery(qry[:], parent, idx, hd, tl, transform, histogram, cmds) 8147 8148 if res != "" { 8149 fmt.Printf("%s", res) 8150 } 8151 8152 return 8153 } 8154 8155 // LAUNCH PRODUCER, CONSUMER, AND UNSHUFFLER GOROUTINES 8156 8157 // launch producer goroutine to partition XML by pattern 8158 xmlq := eutils.CreateXMLProducer(topPattern, star, rdr) 8159 8160 // launch consumer goroutines to parse and explore partitioned XML objects 8161 tblq := 
createConsumers(cmds, parent, hd, tl, transform, histogram, xmlq) 8162 8163 // launch unshuffler goroutine to restore order of results 8164 unsq := eutils.CreateXMLUnshuffler(tblq) 8165 8166 if xmlq == nil || tblq == nil || unsq == nil { 8167 fmt.Fprintf(os.Stderr, "\nERROR: Unable to create servers\n") 8168 os.Exit(1) 8169 } 8170 8171 // PERFORMANCE SUMMARY 8172 8173 /* 8174 if dbug { 8175 8176 // drain results, but suppress extraction output 8177 for ext := range unsq { 8178 byteCount += len(ext.Text) 8179 recordCount++ 8180 runtime.Gosched() 8181 } 8182 8183 // force garbage collection, return memory to operating system 8184 debug.FreeOSMemory() 8185 8186 // print processing parameters as XML object 8187 stopTime := time.Now() 8188 duration := stopTime.Sub(StartTime) 8189 seconds := float64(duration.Nanoseconds()) / 1e9 8190 8191 // Threads is a more easily explained concept than GOMAXPROCS 8192 fmt.Printf("<Xtract>\n") 8193 fmt.Printf(" <Threads>%d</Threads>\n", numProcs) 8194 fmt.Printf(" <Parsers>%d</Parsers>\n", NumServe) 8195 fmt.Printf(" <Time>%.3f</Time>\n", seconds) 8196 if seconds >= 0.001 && recordCount > 0 { 8197 rate := int(float64(recordCount) / seconds) 8198 fmt.Printf(" <Rate>%d</Rate>\n", rate) 8199 } 8200 fmt.Printf("</Xtract>\n") 8201 8202 return 8203 } 8204 */ 8205 8206 // DRAIN OUTPUT CHANNEL TO EXECUTE EXTRACTION COMMANDS, RESTORE OUTPUT ORDER WITH HEAP 8207 8208 var buffer strings.Builder 8209 count := 0 8210 okay := false 8211 8212 wrtr := bufio.NewWriter(os.Stdout) 8213 8214 // printResult prints output for current pattern, handles -empty and -ident flags, and periodically flushes buffer 8215 printResult := func(curr eutils.XMLRecord) { 8216 8217 str := curr.Text 8218 8219 if mpty { 8220 8221 if str == "" { 8222 8223 okay = true 8224 8225 idx := curr.Index 8226 val := strconv.Itoa(idx) 8227 buffer.WriteString(val[:]) 8228 buffer.WriteString("\n") 8229 8230 count++ 8231 } 8232 8233 } else if str != "" { 8234 8235 okay = true 8236 8237 if 
idnt { 8238 idx := curr.Index 8239 val := strconv.Itoa(idx) 8240 buffer.WriteString(val[:]) 8241 buffer.WriteString("\t") 8242 } 8243 8244 // save output to byte buffer 8245 buffer.WriteString(str[:]) 8246 8247 count++ 8248 } 8249 8250 if count > 1000 { 8251 count = 0 8252 txt := buffer.String() 8253 if txt != "" { 8254 // print current buffer 8255 wrtr.WriteString(txt[:]) 8256 } 8257 buffer.Reset() 8258 } 8259 } 8260 8261 if head != "" { 8262 buffer.WriteString(head[:]) 8263 buffer.WriteString("\n") 8264 } 8265 8266 // drain unshuffler channel 8267 8268 if posn == "outer" { 8269 8270 // print only first and last records 8271 var beg *eutils.XMLRecord 8272 var end *eutils.XMLRecord 8273 8274 for curr := range unsq { 8275 8276 if beg == nil { 8277 beg = &eutils.XMLRecord{Index: curr.Index, Ident: curr.Ident, Text: curr.Text} 8278 } else { 8279 end = &eutils.XMLRecord{Index: curr.Index, Ident: curr.Ident, Text: curr.Text} 8280 } 8281 8282 recordCount++ 8283 } 8284 8285 if beg != nil { 8286 printResult(*beg) 8287 } 8288 if end != nil { 8289 printResult(*end) 8290 } 8291 8292 } else if posn == "inner" { 8293 8294 // print all but first and last records 8295 var prev *eutils.XMLRecord 8296 var next *eutils.XMLRecord 8297 first := true 8298 8299 for curr := range unsq { 8300 8301 if first { 8302 first = false 8303 } else { 8304 prev = next 8305 next = &eutils.XMLRecord{Index: curr.Index, Ident: curr.Ident, Text: curr.Text} 8306 } 8307 8308 if prev != nil { 8309 printResult(*prev) 8310 } 8311 8312 recordCount++ 8313 } 8314 8315 } else if posn == "even" { 8316 8317 even := false 8318 8319 for curr := range unsq { 8320 8321 if even { 8322 printResult(curr) 8323 } 8324 even = !even 8325 8326 recordCount++ 8327 } 8328 8329 } else if posn == "odd" { 8330 8331 odd := true 8332 8333 for curr := range unsq { 8334 8335 if odd { 8336 printResult(curr) 8337 } 8338 odd = !odd 8339 8340 recordCount++ 8341 } 8342 8343 } else { 8344 8345 // default or -position all 8346 for curr := 
range unsq { 8347 8348 // send result to output 8349 printResult(curr) 8350 8351 recordCount++ 8352 runtime.Gosched() 8353 } 8354 } 8355 8356 if tail != "" { 8357 buffer.WriteString(tail[:]) 8358 buffer.WriteString("\n") 8359 } 8360 8361 // do not print head or tail if no extraction output 8362 if okay { 8363 txt := buffer.String() 8364 if txt != "" { 8365 // print final buffer 8366 wrtr.WriteString(txt[:]) 8367 } 8368 } 8369 buffer.Reset() 8370 8371 wrtr.Flush() 8372 8373 // print -histogram results, if populated 8374 var keys []string 8375 for ky := range histogram { 8376 keys = append(keys, ky) 8377 } 8378 if len(keys) > 0 { 8379 // sort fields in alphabetical or numeric order 8380 sort.Slice(keys, func(i, j int) bool { 8381 // numeric sort on strings checks lengths first 8382 if eutils.IsAllDigits(keys[i]) && eutils.IsAllDigits(keys[j]) { 8383 lni := len(keys[i]) 8384 lnj := len(keys[j]) 8385 // shorter string is numerically less, assuming no leading zeros 8386 if lni < lnj { 8387 return true 8388 } 8389 if lni > lnj { 8390 return false 8391 } 8392 } 8393 // same length or non-numeric, can now do string comparison on contents 8394 return keys[i] < keys[j] 8395 }) 8396 8397 for _, str := range keys { 8398 8399 count := histogram[str] 8400 val := strconv.Itoa(count) 8401 os.Stdout.WriteString(val) 8402 os.Stdout.WriteString("\t") 8403 os.Stdout.WriteString(str) 8404 os.Stdout.WriteString("\n") 8405 } 8406 } 8407 8408 // force garbage collection and return memory before calculating processing rate 8409 debug.FreeOSMemory() 8410 8411 if timr { 8412 printDuration("records") 8413 } 8414} 8415