1// ===========================================================================
2//
3//                            PUBLIC DOMAIN NOTICE
4//            National Center for Biotechnology Information (NCBI)
5//
6//  This software/database is a "United States Government Work" under the
7//  terms of the United States Copyright Act. It was written as part of
8//  the author's official duties as a United States Government employee and
9//  thus cannot be copyrighted. This software/database is freely available
10//  to the public for use. The National Library of Medicine and the U.S.
11//  Government do not place any restriction on its use or reproduction.
12//  We would, however, appreciate having the NCBI and the author cited in
13//  any work or product based on this material.
14//
15//  Although all reasonable efforts have been taken to ensure the accuracy
16//  and reliability of the software and data, the NLM and the U.S.
17//  Government do not and cannot warrant the performance or results that
18//  may be obtained by using this software or data. The NLM and the U.S.
19//  Government disclaim all warranties, express or implied, including
20//  warranties of performance, merchantability or fitness for any particular
21//  purpose.
22//
23// ===========================================================================
24//
25// File Name:  xtract.go
26//
27// Author:  Jonathan Kans
28//
29// ==========================================================================
30
31package main
32
33import (
34	"bufio"
35	"encoding/base64"
36	"encoding/hex"
37	"eutils"
38	"fmt"
39	"github.com/fatih/color"
40	"github.com/surgebase/porter2"
41	"html"
42	"math"
43	"net/url"
44	"os"
45	"path"
46	"path/filepath"
47	"regexp"
48	"runtime"
49	"runtime/debug"
50	"runtime/pprof"
51	"sort"
52	"strconv"
53	"strings"
54	"sync"
55	"time"
56	"unicode"
57)
58
59// XTRACT HELP MESSAGE TEXT
60
// xtractHelp is the usage text for xtract. It lists the command-line
// arguments by category (processing flags, exploration, conditional
// execution, constraints, formatting, element selection, and the numeric,
// string, text, and sequence processing operations), followed by notes and
// by xtract and transmute example commands. The text itself names -help as
// the flag that prints this document.
const xtractHelp = `
Overview

  Xtract uses command-line arguments to convert XML data into a tab-delimited table.

  -pattern places the data from individual records into separate rows.

  -element extracts values from specified fields into separate columns.

  -group, -block, and -subset limit element exploration to selected XML subregions.

Processing Flags

  -strict          Remove HTML and MathML tags
  -mixed           Allow mixed content XML

  -accent          Excise Unicode accents and diacritical marks
  -ascii           Unicode to numeric HTML character entities
  -compress        Compress runs of spaces

  -stops           Retain stop words in selected phrases

Data Source

  -input           Read XML from file instead of stdin
  -transform       File of substitutions for -translate

Exploration Argument Hierarchy

  -pattern         Name of record within set
  -group             Use of different argument
  -block               names allows command-line
  -subset                control of nested looping

Path Navigation

  -path            Explore by list of adjacent object names

Exploration Constructs

  Object           DateRevised
  Parent/Child     Book/AuthorList
  Path             MedlineCitation/Article/Journal/JournalIssue/PubDate
  Heterogeneous    "PubmedArticleSet/*"
  Exhaustive       "History/**"
  Nested           "*/Taxon"
  Recursive        "**/Gene-commentary"

Conditional Execution

  -if              Element [@attribute] required
  -unless          Skip if element matches
  -and             All tests must pass
  -or              Any passing test suffices
  -else            Execute if conditional test failed
  -position        [first|last|outer|inner|even|odd|all]

String Constraints

  -equals          String must match exactly
  -contains        Substring must be present
  -is-within       String must be present
  -starts-with     Substring must be at beginning
  -ends-with       Substring must be at end
  -is-not          String must not match
  -is-before       First string < second string
  -is-after        First string > second string
  -matches         Matches without commas or semicolons
  -resembles       Requires all words, but in any order

Object Constraints

  -is-equal-to     Object values must match
  -differs-from    Object values must differ

Numeric Constraints

  -gt              Greater than
  -ge              Greater than or equal to
  -lt              Less than
  -le              Less than or equal to
  -eq              Equal to
  -ne              Not equal to

Format Customization

  -ret             Override line break between patterns
  -tab             Replace tab character between fields
  -sep             Separator between group members
  -pfx             Prefix to print before group
  -sfx             Suffix to print after group
  -rst             Reset -sep through -elg
  -clr             Clear queued tab separator
  -pfc             Preface combines -clr and -pfx
  -deq             Delete and replace queued tab separator
  -def             Default placeholder for missing fields
  -lbl             Insert arbitrary text

XML Generation

  -set             XML tag for entire set
  -rec             XML tag for each record

  -wrp             Wrap elements in XML object

  -enc             Encase instance in XML object
  -plg             Prologue to print before instance
  -elg             Epilogue to print after instance

  -pkg             Package subset in XML object
  -fwd             Foreword to print before subset
  -awd             Afterword to print after subset

Element Selection

  -element         Print all items that match tag name
  -first           Only print value of first item
  -last            Only print value of last item
  -NAME            Record value in named variable
  --STATS          Accumulate values into variable

-element Constructs

  Tag              Caption
  Group            Initials,LastName
  Parent/Child     MedlineCitation/PMID
  Recursive        "**/Gene-commentary_accession"
  Unrestricted     "PubDate/*"
  Attribute        DescriptorName@MajorTopicYN
  Range            MedlineDate[1:4]
  Substring        "Title[phospholipase | rattlesnake]"
  Object Count     "#Author"
  Item Length      "%Title"
  Element Depth    "^PMID"
  Variable         "&NAME"

Special -element Operations

  Parent Index     "+"
  Object Name      "?"
  XML Subtree      "*"
  Children         "$"
  Attributes       "@"

Numeric Processing

  -num             Count
  -len             Length
  -sum             Sum
  -min             Minimum
  -max             Maximum
  -inc             Increment
  -dec             Decrement
  -sub             Difference
  -avg             Average
  -dev             Deviation
  -med             Median
  -mul             Product
  -div             Quotient
  -mod             Remainder
  -bin             Binary
  -bit             Bit Count

String Processing

  -encode          XML-encode <, >, &, ", and ' characters
  -plain           Remove embedded mixed-content markup tags
  -upper           Convert text to upper-case
  -lower           Convert text to lower-case
  -chain           Change_spaces_to_underscores
  -title           Capitalize initial letters of words
  -year            Extract first 4-digit year from string
  -doi             Add https://doi.org/ prefix, URL encode
  -translate       Substitute values with -transform table

Text Processing

  -terms           Partition text at spaces
  -words           Split at punctuation marks
  -pairs           Adjacent informative words
  -order           Rearrange words in sorted order
  -reverse         Reverse words in string
  -letters         Separate individual letters
  -clauses         Break at phrase separators

Regular Expression

  -replace         Substitute text using regular expressions

  -reg             Target expression
  -exp             Replacement pattern

Sequence Processing

  -revcomp         Reverse complement nucleotide sequence
  -nucleic         Subrange determines forward or revcomp
  -fasta           Split sequence into blocks of 50 letters
  -ncbi2na         Expand ncbi2na to iupac
  -ncbi4na         Expand ncbi4na to iupac
                     (May need to truncate result to actual sequence length)
  -molwt           Calculate molecular weight of peptide

Sequence Coordinates

  -0-based         Zero-Based
  -1-based         One-Based
  -ucsc-based      Half-Open

Command Generator

  -insd            Generate INSDSeq extraction commands

-insd Argument Order

  Descriptors      INSDSeq_sequence INSDSeq_definition INSDSeq_division
  Flags            [complete|partial]
  Feature(s)       CDS,mRNA
  Qualifiers       INSDFeature_key "#INSDInterval" gene product feat_location sub_sequence

Variation Processing

  -hgvs            Convert sequence variation format to XML

Frequency Table

  -histogram       Collects data for sort-uniq-count on entire set of records

Entrez Indexing

  -e2index         Create Entrez index XML
  -indices         Index normalized words

Output Organization

  -head            Print before everything else
  -tail            Print after everything else
  -hd              Print before each record
  -tl              Print after each record

Record Selection

  -select          Select record subset by conditions
  -in              File of identifiers to use for selection

Record Rearrangement

  -sort            Element to use as sort key

Reformatting

  -format          [copy|compact|flush|indent|expand]

Validation

  -verify          Report XML data integrity problems

Summary

  -outline         Display outline of XML structure
  -synopsis        Display individual XML paths
  -contour         Display XML paths to leaf nodes
                     [delimiter]

Documentation

  -help            Print this document
  -examples        Examples of EDirect and xtract usage
  -unix            Common Unix command arguments
  -version         Print version number

Notes

  String constraints use case-insensitive comparisons.

  Numeric constraints and selection arguments use integer values.

  -num and -len selections are synonyms for Object Count (#) and Item Length (%).

  -words, -pairs, -reverse, and -indices convert to lower case.

  See transmute -help for data conversion and modification functions.

Xtract Examples

  -pattern DocumentSummary -element Id -first Name Title

  -pattern "PubmedArticleSet/*" -block Author -sep " " -element Initials,LastName

  -pattern PubmedArticle -block MeshHeading -if "@MajorTopicYN" -equals Y -sep " / " -element DescriptorName,QualifierName

  -pattern GenomicInfoType -element ChrAccVer ChrStart ChrStop

  -pattern Taxon -block "*/Taxon" -unless Rank -equals "no rank" -tab "\n" -element Rank,ScientificName

  -pattern Entrezgene -block "**/Gene-commentary"

  -block INSDReference -position 2

  -subset INSDInterval -position last -POS INSDInterval_to -element "&SEQ[&POS+1:]"

  -if Author -and Title

  -if "#Author" -lt 6 -and "%Title" -le 70

  -if DateRevised/Year -gt 2005

  -if ChrStop -lt ChrStart

  -if CommonName -contains mouse

  -if "&ABST" -starts-with "Transposable elements"

  -if MapLocation -element MapLocation -else -lbl "\-"

  -if inserted_sequence -differs-from deleted_sequence

  -min ChrStart,ChrStop

  -max ExonCount

  -inc position -element inserted_sequence

  -1-based ChrStart

  -insd CDS gene product protein_id translation

  -insd complete mat_peptide "%peptide" product peptide

  -insd CDS INSDInterval_iscomp@value INSDInterval_from INSDInterval_to

  -pattern PubmedArticle -select PubDate/Year -eq 2015

  -pattern PubmedArticle -select MedlineCitation/PMID -in file_of_pmids.txt

  -wrp PubmedArticleSet -pattern PubmedArticle -sort MedlineCitation/PMID

  -pattern PubmedArticle -split 5000 -prefix "subset" -suffix "xml"

  -pattern PubmedBookArticle -path BookDocument.Book.AuthorList.Author -element LastName

  -pattern PubmedArticle -group MedlineCitation/Article/Journal/JournalIssue/PubDate -year "PubDate/*"

  -mixed -verify MedlineCitation/PMID -html

Transmute Examples

  transmute -j2x -set - -rec GeneRec

  transmute -t2x -set Set -rec Rec -skip 1 Code Name

  transmute -filter ExpXml decode content

  transmute -filter LocationHist remove object

  transmute -normalize pubmed

  transmute -head "<PubmedArticleSet>" -tail "</PubmedArticleSet>" -pattern "PubmedArticleSet/*" -format
`
419
// xtractInternal is the help text for xtract's internal options: overrides
// of the performance defaults, internal component tests, debugging flags,
// and documentation flags. It also carries sample shell scripts for
// processor titration and Entrez index timing, the titration results,
// MeSH tree index preparation commands, and execution profiling commands.
const xtractInternal = `
Performance Default Overrides

  -proc     Number of CPU processors used
  -cons     Ratio of parsers to processors
  -serv     Concurrent parser instances
  -chan     Communication channel depth
  -heap     Order restoration heap size
  -farm     Node allocation buffer length
  -gogc     Garbage collection tuning knob

Internal Component Performance

  -chunk    StreamBlocks
  -split    StreamBlocks -> SplitPattern
  -token    StreamBlocks -> StreamTokens

Debugging

  -debug    Display run-time parameter summary
  -empty    Flag records with no output
  -ident    Print record index numbers
  -stats    Show processing time for each record
  -timer    Report processing duration and rate
  -trial    Optimize -proc value, requires -input

Documentation

  -keys     Keyboard navigation shortcuts
  -unix     Common Unix commands

Performance Tuning Script

  XtractTrials() {
    echo -e "<Trials>"
    for tries in {1..5}
    do
      xtract -debug -input "$1" -proc "$2" -pattern PubmedArticle -element LastName
    done
    echo -e "</Trials>"
  }

  for proc in {1..8}
  do
    XtractTrials "carotene.xml" "$proc" |
    xtract -pattern Trials -lbl "$proc" -avg Rate -dev Rate
  done

Processor Titration Results

  1    27622    31
  2    51799    312
  3    74853    593
  4    95867    1337
  5    97171    4019
  6    93460    2458
  7    87467    1030
  8    82448    2651

Entrez Index Performance Measurement

  IndexTrials() {
    echo -e "<Trials>"
    for tries in {1..5}
    do
      cat "$1" | xtract -debug -proc "$2" -e2index
    done
    echo -e "</Trials>"
  }

  for proc in {1..8}
  do
    IndexTrials "carotene.xml" "$proc" |
    xtract -pattern Trials -lbl "$proc" -avg Rate -dev Rate
  done

MeSH Tree Index Preparation

  ftp-cp nlmpubs.nlm.nih.gov online/mesh/MESH_FILES/xmlmesh desc2021.zip
  unzip desc2021.zip
  rm desc2021.zip

  cat desc2021.xml |
  xtract -pattern DescriptorRecord -element "DescriptorRecord/DescriptorUI" \
    -sep "," -element TreeNumber > meshtree.txt

Execution Profiling

  cat carotene.xml > /dev/null
  ./xtract -profile -timer -input carotene.xml -pattern PubmedArticle -element LastName > /dev/null
  go tool pprof --pdf ./cpu.pprof > ~/Desktop/callgraph.pdf
  rm cpu.pprof
`
513
// keyboardShortcuts is a reference card of terminal keystrokes for command
// history, cursor movement, deletion, autocompletion, and program control.
// The xtractInternal help text lists -keys as "Keyboard navigation
// shortcuts"; presumably that flag prints this text (confirm in the
// argument-handling code).
const keyboardShortcuts = `
Command History

  Ctrl-n     Next command
  Ctrl-p     Previous command

Move Cursor Forward

  Ctrl-e     To end of line
  Ctrl-f     By one character
  Esc-f      By one argument

Move Cursor Backward

  Ctrl-a     To beginning of line
  Ctrl-b     By one character
  Esc-b      By one argument

Delete

  Del        Previous character
  Ctrl-d     Next character
  Ctrl-k     To end of line
  Ctrl-u     Entire line
  Ctrl-w     Previous word
  Esc-Del    Previous argument
  Esc-d      Next argument

Autocomplete

  Tab        Completes directory or file names

Program Control

  Ctrl-c     Quit running program
  ^x^y       Run last command replacing x with y
  Ctrl-z     Suspend foreground job
  kill %%    Quit suspended script
`
553
// unixCommands is a quick-reference card for common Unix commands and shell
// syntax: text processing tools (sort, uniq, grep, sed, tr, cut, ...),
// regular expression notation, file compression, directory navigation,
// redirection, shell script variables and tests, and parameter-expansion
// forms for removing prefixes and suffixes.
//
// The "Remove Prefix" examples use the pattern "*." (anything up to a dot),
// which is what yields "tar.gz" and "gz" for "example.tar.gz". The pattern
// ".*" only matches values that begin with a dot and would leave the file
// name unchanged.
const unixCommands = `
Process by Contents

 sort      Sorts lines of text

  -f       Ignore case
  -n       Numeric comparison
  -r       Reverse result order

  -k       Field key (start,stop or first)
  -u       Unique lines with identical keys

  -b       Ignore leading blanks
  -s       Stable sort
  -t       Specify field separator

 uniq      Removes repeated lines

  -c       Count occurrences
  -i       Ignore case

  -f       Ignore first n fields
  -s       Ignore first n characters

  -d       Only output repeated lines
  -u       Only output non-repeated lines

 grep      Matches patterns using regular expressions

  -i       Ignore case
  -v       Invert search
  -w       Search expression as a word
  -x       Search expression as whole line

  -e       Specify individual pattern

  -c       Only count number of matches
  -n       Print line numbers
  -A       Number of lines after match
  -B       Number of lines before match

Regular Expressions

 Characters

  .        Any single character (except newline)
  \w       Alphabetic [A-Za-z], numeric [0-9], or underscore (_)
  \s       Whitespace (space or tab)
  \        Escapes special characters
  []       Matches any enclosed characters

 Positions

  ^        Beginning of line
  $        End of line
  \b       Word boundary

 Repeat Matches

  ?        0 or 1
  *        0 or more
  +        1 or more
  {n}      Exactly n

 Escape Sequences

  \n       Line break
  \t       Tab character

Modify Contents

 sed       Replaces text strings

  -e       Specify individual expression
  s///     Substitute
     /g    Global
     /I    Case-insensitive
     /p    Print

 tr        Translates characters

  -d       Delete character
  -s       Squeeze runs of characters

 rev       Reverses characters on line

Format Contents

 column    Aligns columns by content width

  -s       Specify field separator
  -t       Create table

 expand    Aligns columns to specified positions

  -t       Tab positions

 fold      Wraps lines at a specific width

  -w       Line width
  -s       Fold at spaces

Filter by Position

 cut       Removes parts of lines

  -c       Characters to keep
  -f       Fields to keep
  -d       Specify field separator
  -s       Suppress lines with no delimiters

 head      Prints first lines

  -n       Number of lines

 tail      Prints last lines

  -n       Number of lines

Miscellaneous

 wc        Counts words, lines, or characters

  -c       Characters
  -l       Lines
  -w       Words

 xargs     Constructs arguments

  -n       Number of words per batch

 mktemp    Make temporary file

File Compression

 tar       Archive files

  -c       Create archive
  -f       Name of output file
  -z       Compress archive with gzip

 gzip      Compress file

  -k       Keep original file
  -9       Best compression

 unzip     Decompress .zip archive

  -p       Pipe to stdout

 gzcat     Decompress .gz archive and pipe to stdout

Directory and File Navigation

 cd        Changes directory

  /        Root
  ~        Home
  .        Current
  ..       Parent
  -        Previous

 ls        Lists file names

  -1       One entry per line
  -a       Show files beginning with dot (.)
  -l       List in long format
  -R       Recursively explore subdirectories
  -S       Sort files by size
  -t       Sort by most recently modified
  .*       Current and parent directory

 pwd       Prints working directory path

File Redirection

  <        Read stdin from file
  >        Redirect stdout to file
  >>       Append to file
  2>       Redirect stderr
  2>&1     Merge stderr into stdout
  |        Pipe between programs
  <(cmd)   Execute command, read results as file

Shell Script Variables

  $0       Name of script
  $n       Nth argument
  $#       Number of arguments
  "$*"     Argument list as one argument
  "$@"     Argument list as separate arguments
  $?       Exit status of previous command

Shell Script Tests

  -d       Directory exists
  -f       File exists
  -s       File is not empty
  -n       Length of string is non-zero
  -x       File is executable
  -z       Variable is empty or not set

File and Directory Extraction

           BAS=$(printf pubmed%03d $n)
           DIR=$(dirname "$0")
           FIL=$(basename "$0")

Remove Prefix

           FILE="example.tar.gz"
  #        ${FILE#*.}  -> tar.gz
  ##       ${FILE##*.} -> gz

Remove Suffix

           FILE="example.tar.gz"
           TYPE="http://identifiers.org/uniprot_enzymes/"
  %        ${FILE%.*}  -> example.tar
           ${TYPE%/}   -> http://identifiers.org/uniprot_enzymes
  %%       ${FILE%%.*} -> example
`
776
777// GLOBAL VARIABLES
778
// doStem and deStop are package-level switches; both start out false.
// NOTE(review): presumably doStem turns on word stemming and deStop turns on
// stop-word removal (the help text documents -stops as "Retain stop words
// in selected phrases") - confirm against the code that sets them.
var doStem, deStop bool
783
784// TYPED CONSTANTS
785
// LevelType is the integer code for an exploration argument level
type LevelType int

// Exploration level codes, one per exploration flag in argTypeIs (-unit
// through -pattern). Numbering starts at 1, so the zero value of LevelType
// does not name any level.
const (
	UNIT LevelType = iota + 1
	SUBSET
	SECTION
	BLOCK
	BRANCH
	GROUP
	DIVISION
	PATH
	PATTERN
)
802
// IndentType is the integer code for an XML formatting style
type IndentType int

// XML formatting style codes, numbered consecutively from zero, so
// SINGULARITY is the zero value of IndentType.
const (
	SINGULARITY = IndentType(iota)
	COMPACT
	FLUSH
	INDENT
	SUBTREE
	WRAPPED
)
815
// OpType is the integer code for an operation
type OpType int

// Operation codes, numbered consecutively from zero in the order listed.
// The grouping below follows the categories that argTypeIs assigns to the
// corresponding command-line flags.
const (
	// zero value of OpType
	UNSET = OpType(iota)

	// element selection, string processing, and text processing
	// (flags classified as EXTRACTION)
	ELEMENT
	FIRST
	LAST
	ENCODE
	DECODE
	PLAIN
	UPPER
	LOWER
	CHAIN
	TITLE
	ORDER
	YEAR
	DOI
	TRANSLATE
	REPLACE
	TERMS
	WORDS
	PAIRS
	REVERSE
	LETTERS
	CLAUSES
	INDICES
	MESHCODE
	MATRIX
	HISTOGRAM
	ACCENTED

	// format customization and XML generation
	// (flags classified as CUSTOMIZATION)
	PFX
	SFX
	SEP
	TAB
	RET
	LBL
	CLR
	PFC
	DEQ
	PLG
	ELG
	FWD
	AWD
	WRP
	ENC
	PKG
	RST
	DEF
	REG
	EXP
	COLOR

	// conditional execution (flags classified as CONDITIONAL)
	POSITION
	SELECT
	IF
	UNLESS
	MATCH
	AVOID
	AND
	OR

	// string and object constraints
	EQUALS
	CONTAINS
	ISWITHIN
	STARTSWITH
	ENDSWITH
	ISNOT
	ISBEFORE
	ISAFTER
	MATCHES
	RESEMBLES
	ISEQUALTO
	DIFFERSFROM

	// numeric constraints
	GT
	GE
	LT
	LE
	EQ
	NE

	// numeric processing (flags classified as EXTRACTION)
	NUM
	LEN
	SUM
	MIN
	MAX
	INC
	DEC
	SUB
	AVG
	DEV
	MED
	MUL
	DIV
	MOD
	BIN
	BIT

	// sequence coordinates
	ZEROBASED
	ONEBASED
	UCSCBASED

	// sequence processing
	REVCOMP
	NUCLEIC
	FASTA
	NCBI2NA
	NCBI4NA
	MOLWT

	// variation processing and the -else branch
	HGVS
	ELSE

	// codes that have no command-line flag in opTypeIs
	// NOTE(review): presumably assigned while parsing -element items such
	// as variables and the "?", "*", "$", "@", "#", "%", "^", and "+"
	// constructs listed in the help text - confirm against the parser.
	VARIABLE
	ACCUMULATOR
	VALUE
	QUESTION
	STAR
	DOLLAR
	ATSIGN
	COUNT
	LENGTH
	DEPTH
	INDEX
	UNRECOGNIZED
)
935
// ArgumentType is the integer code for the class a command-line argument
// belongs to
type ArgumentType int

// Argument class codes, as assigned to flags by the argTypeIs table.
// Numbering starts at 1, so the zero value of ArgumentType (which a lookup
// of an unknown flag in argTypeIs yields) matches none of them.
const (
	EXPLORATION ArgumentType = iota + 1
	CONDITIONAL
	EXTRACTION
	CUSTOMIZATION
)
947
// RangeType is the integer code for the kind of range attached to an element
type RangeType int

// Element range codes, numbered consecutively from zero, so NORANGE is the
// zero value of RangeType.
const (
	NORANGE = RangeType(iota)
	STRINGRANGE
	VARIABLERANGE
	INTEGERRANGE
)
958
// SeqEndType tells -ucsc-based which end of a location a coordinate is
type SeqEndType int

// Coordinate role codes. Numbering starts at 1, so the zero value of
// SeqEndType names no role.
const (
	ISSTART SeqEndType = iota + 1
	ISSTOP
	ISPOS
)

// SequenceType is the value type of the sequenceTypeIs table, which is keyed
// by "record:field" strings. The field order matters because that table
// uses unkeyed literals such as {1, ISSTART}.
type SequenceType struct {
	// Based is 0 or 1 in every sequenceTypeIs entry; presumably the
	// coordinate base the field natively uses (TODO confirm)
	Based int
	// Which records whether the field is a start, stop, or single position
	Which SeqEndType
}
975
976// MUTEXES
977
978var hlock sync.Mutex
979
980var slock sync.RWMutex
981
982// ARGUMENT MAPS
983
984var argTypeIs = map[string]ArgumentType{
985	"-unit":         EXPLORATION,
986	"-Unit":         EXPLORATION,
987	"-subset":       EXPLORATION,
988	"-Subset":       EXPLORATION,
989	"-section":      EXPLORATION,
990	"-Section":      EXPLORATION,
991	"-block":        EXPLORATION,
992	"-Block":        EXPLORATION,
993	"-branch":       EXPLORATION,
994	"-Branch":       EXPLORATION,
995	"-group":        EXPLORATION,
996	"-Group":        EXPLORATION,
997	"-division":     EXPLORATION,
998	"-Division":     EXPLORATION,
999	"-path":         EXPLORATION,
1000	"-Path":         EXPLORATION,
1001	"-pattern":      EXPLORATION,
1002	"-Pattern":      EXPLORATION,
1003	"-position":     CONDITIONAL,
1004	"-select":       CONDITIONAL,
1005	"-if":           CONDITIONAL,
1006	"-unless":       CONDITIONAL,
1007	"-match":        CONDITIONAL,
1008	"-avoid":        CONDITIONAL,
1009	"-and":          CONDITIONAL,
1010	"-or":           CONDITIONAL,
1011	"-equals":       CONDITIONAL,
1012	"-contains":     CONDITIONAL,
1013	"-is-within":    CONDITIONAL,
1014	"-starts-with":  CONDITIONAL,
1015	"-ends-with":    CONDITIONAL,
1016	"-is-not":       CONDITIONAL,
1017	"-is-before":    CONDITIONAL,
1018	"-is-after":     CONDITIONAL,
1019	"-matches":      CONDITIONAL,
1020	"-resembles":    CONDITIONAL,
1021	"-is-equal-to":  CONDITIONAL,
1022	"-differs-from": CONDITIONAL,
1023	"-gt":           CONDITIONAL,
1024	"-ge":           CONDITIONAL,
1025	"-lt":           CONDITIONAL,
1026	"-le":           CONDITIONAL,
1027	"-eq":           CONDITIONAL,
1028	"-ne":           CONDITIONAL,
1029	"-element":      EXTRACTION,
1030	"-first":        EXTRACTION,
1031	"-last":         EXTRACTION,
1032	"-encode":       EXTRACTION,
1033	"-decode":       EXTRACTION,
1034	"-decode64":     EXTRACTION,
1035	"-plain":        EXTRACTION,
1036	"-upper":        EXTRACTION,
1037	"-lower":        EXTRACTION,
1038	"-chain":        EXTRACTION,
1039	"-title":        EXTRACTION,
1040	"-order":        EXTRACTION,
1041	"-year":         EXTRACTION,
1042	"-doi":          EXTRACTION,
1043	"-translate":    EXTRACTION,
1044	"-replace":      EXTRACTION,
1045	"-terms":        EXTRACTION,
1046	"-words":        EXTRACTION,
1047	"-pairs":        EXTRACTION,
1048	"-reverse":      EXTRACTION,
1049	"-letters":      EXTRACTION,
1050	"-clauses":      EXTRACTION,
1051	"-indices":      EXTRACTION,
1052	"-meshcode":     EXTRACTION,
1053	"-matrix":       EXTRACTION,
1054	"-histogram":    EXTRACTION,
1055	"-accented":     EXTRACTION,
1056	"-num":          EXTRACTION,
1057	"-len":          EXTRACTION,
1058	"-sum":          EXTRACTION,
1059	"-min":          EXTRACTION,
1060	"-max":          EXTRACTION,
1061	"-inc":          EXTRACTION,
1062	"-dec":          EXTRACTION,
1063	"-sub":          EXTRACTION,
1064	"-avg":          EXTRACTION,
1065	"-dev":          EXTRACTION,
1066	"-med":          EXTRACTION,
1067	"-mul":          EXTRACTION,
1068	"-div":          EXTRACTION,
1069	"-mod":          EXTRACTION,
1070	"-bin":          EXTRACTION,
1071	"-bit":          EXTRACTION,
1072	"-0-based":      EXTRACTION,
1073	"-zero-based":   EXTRACTION,
1074	"-1-based":      EXTRACTION,
1075	"-one-based":    EXTRACTION,
1076	"-ucsc":         EXTRACTION,
1077	"-ucsc-based":   EXTRACTION,
1078	"-ucsc-coords":  EXTRACTION,
1079	"-bed-based":    EXTRACTION,
1080	"-bed-coords":   EXTRACTION,
1081	"-revcomp":      EXTRACTION,
1082	"-nucleic":      EXTRACTION,
1083	"-fasta":        EXTRACTION,
1084	"-ncbi2na":      EXTRACTION,
1085	"-ncbi4na":      EXTRACTION,
1086	"-molwt":        EXTRACTION,
1087	"-hgvs":         EXTRACTION,
1088	"-else":         EXTRACTION,
1089	"-pfx":          CUSTOMIZATION,
1090	"-sfx":          CUSTOMIZATION,
1091	"-sep":          CUSTOMIZATION,
1092	"-tab":          CUSTOMIZATION,
1093	"-ret":          CUSTOMIZATION,
1094	"-lbl":          CUSTOMIZATION,
1095	"-clr":          CUSTOMIZATION,
1096	"-pfc":          CUSTOMIZATION,
1097	"-deq":          CUSTOMIZATION,
1098	"-plg":          CUSTOMIZATION,
1099	"-elg":          CUSTOMIZATION,
1100	"-fwd":          CUSTOMIZATION,
1101	"-awd":          CUSTOMIZATION,
1102	"-wrp":          CUSTOMIZATION,
1103	"-enc":          CUSTOMIZATION,
1104	"-pkg":          CUSTOMIZATION,
1105	"-rst":          CUSTOMIZATION,
1106	"-def":          CUSTOMIZATION,
1107	"-reg":          CUSTOMIZATION,
1108	"-exp":          CUSTOMIZATION,
1109	"-color":        CUSTOMIZATION,
1110}
1111
1112var opTypeIs = map[string]OpType{
1113	"-element":      ELEMENT,
1114	"-first":        FIRST,
1115	"-last":         LAST,
1116	"-encode":       ENCODE,
1117	"-decode":       DECODE,
1118	"-decode64":     DECODE,
1119	"-plain":        PLAIN,
1120	"-upper":        UPPER,
1121	"-lower":        LOWER,
1122	"-chain":        CHAIN,
1123	"-title":        TITLE,
1124	"-order":        ORDER,
1125	"-year":         YEAR,
1126	"-doi":          DOI,
1127	"-translate":    TRANSLATE,
1128	"-replace":      REPLACE,
1129	"-terms":        TERMS,
1130	"-words":        WORDS,
1131	"-pairs":        PAIRS,
1132	"-reverse":      REVERSE,
1133	"-letters":      LETTERS,
1134	"-clauses":      CLAUSES,
1135	"-indices":      INDICES,
1136	"-meshcode":     MESHCODE,
1137	"-matrix":       MATRIX,
1138	"-histogram":    HISTOGRAM,
1139	"-accented":     ACCENTED,
1140	"-pfx":          PFX,
1141	"-sfx":          SFX,
1142	"-sep":          SEP,
1143	"-tab":          TAB,
1144	"-ret":          RET,
1145	"-lbl":          LBL,
1146	"-clr":          CLR,
1147	"-pfc":          PFC,
1148	"-deq":          DEQ,
1149	"-plg":          PLG,
1150	"-elg":          ELG,
1151	"-fwd":          FWD,
1152	"-awd":          AWD,
1153	"-wrp":          WRP,
1154	"-enc":          ENC,
1155	"-pkg":          PKG,
1156	"-rst":          RST,
1157	"-def":          DEF,
1158	"-reg":          REG,
1159	"-exp":          EXP,
1160	"-color":        COLOR,
1161	"-position":     POSITION,
1162	"-select":       SELECT,
1163	"-if":           IF,
1164	"-unless":       UNLESS,
1165	"-match":        MATCH,
1166	"-avoid":        AVOID,
1167	"-and":          AND,
1168	"-or":           OR,
1169	"-equals":       EQUALS,
1170	"-contains":     CONTAINS,
1171	"-is-within":    ISWITHIN,
1172	"-starts-with":  STARTSWITH,
1173	"-ends-with":    ENDSWITH,
1174	"-is-not":       ISNOT,
1175	"-is-before":    ISBEFORE,
1176	"-is-after":     ISAFTER,
1177	"-matches":      MATCHES,
1178	"-resembles":    RESEMBLES,
1179	"-is-equal-to":  ISEQUALTO,
1180	"-differs-from": DIFFERSFROM,
1181	"-gt":           GT,
1182	"-ge":           GE,
1183	"-lt":           LT,
1184	"-le":           LE,
1185	"-eq":           EQ,
1186	"-ne":           NE,
1187	"-num":          NUM,
1188	"-len":          LEN,
1189	"-sum":          SUM,
1190	"-min":          MIN,
1191	"-max":          MAX,
1192	"-inc":          INC,
1193	"-dec":          DEC,
1194	"-sub":          SUB,
1195	"-avg":          AVG,
1196	"-dev":          DEV,
1197	"-med":          MED,
1198	"-mul":          MUL,
1199	"-div":          DIV,
1200	"-mod":          MOD,
1201	"-bin":          BIN,
1202	"-bit":          BIT,
1203	"-0-based":      ZEROBASED,
1204	"-zero-based":   ZEROBASED,
1205	"-1-based":      ONEBASED,
1206	"-one-based":    ONEBASED,
1207	"-ucsc":         UCSCBASED,
1208	"-ucsc-based":   UCSCBASED,
1209	"-ucsc-coords":  UCSCBASED,
1210	"-bed-based":    UCSCBASED,
1211	"-bed-coords":   UCSCBASED,
1212	"-revcomp":      REVCOMP,
1213	"-nucleic":      NUCLEIC,
1214	"-fasta":        FASTA,
1215	"-ncbi2na":      NCBI2NA,
1216	"-ncbi4na":      NCBI4NA,
1217	"-molwt":        MOLWT,
1218	"-hgvs":         HGVS,
1219	"-else":         ELSE,
1220}
1221
// sequenceTypeIs is keyed by "RecordName:field" (a leading @ on the field
// marks names such as RS:@start) and yields a SequenceType literal made of
// an integer, 0 or 1, followed by one of ISSTART, ISSTOP, or ISPOS.
// NOTE(review): the integer presumably records whether the source reports
// the coordinate as 0-based or 1-based, and the second value whether it is a
// start, stop, or single position - confirm against the SequenceType
// definition and the -0-based / -1-based / -ucsc-based handlers.
var sequenceTypeIs = map[string]SequenceType{
	"INSDSeq:INSDInterval_from":       {1, ISSTART},
	"INSDSeq:INSDInterval_to":         {1, ISSTOP},
	"DocumentSummary:ChrStart":        {0, ISSTART},
	"DocumentSummary:ChrStop":         {0, ISSTOP},
	"DocumentSummary:Chr_start":       {1, ISSTART},
	"DocumentSummary:Chr_end":         {1, ISSTOP},
	"DocumentSummary:Chr_inner_start": {1, ISSTART},
	"DocumentSummary:Chr_inner_end":   {1, ISSTOP},
	"DocumentSummary:Chr_outer_start": {1, ISSTART},
	"DocumentSummary:Chr_outer_end":   {1, ISSTOP},
	"DocumentSummary:start":           {1, ISSTART},
	"DocumentSummary:stop":            {1, ISSTOP},
	"DocumentSummary:display_start":   {1, ISSTART},
	"DocumentSummary:display_stop":    {1, ISSTOP},
	"Entrezgene:Seq-interval_from":    {0, ISSTART},
	"Entrezgene:Seq-interval_to":      {0, ISSTOP},
	"GenomicInfoType:ChrStart":        {0, ISSTART},
	"GenomicInfoType:ChrStop":         {0, ISSTOP},
	"RS:position":                     {0, ISPOS},
	"RS:@asnFrom":                     {0, ISSTART},
	"RS:@asnTo":                       {0, ISSTOP},
	"RS:@end":                         {0, ISSTOP},
	"RS:@leftContigNeighborPos":       {0, ISSTART},
	"RS:@physMapInt":                  {0, ISPOS},
	"RS:@protLoc":                     {0, ISPOS},
	"RS:@rightContigNeighborPos":      {0, ISSTOP},
	"RS:@start":                       {0, ISSTART},
	"RS:@structLoc":                   {0, ISPOS},
}
1252
1253/*
1254	var conv = []string{"A", "C", "G", "T"}
1255	for i := 0; i < 4; i++ {
1256		for j := 0; j < 4; j++ {
1257			for k := 0; k < 4; k++ {
1258				for l := 0; l < 4; l++ {
1259					base := conv[i] + conv[j] + conv[k] + conv[l]
1260					idx := i*64 + j*16 + k*4 + l
1261					fmt.Fprintf(os.Stdout, "\t%d: \"%s\",\n", idx, base)
1262				}
1263			}
1264		}
1265	}
1266*/
1267
// ncbi2naToIupac maps every value from 0 through 255 to a four-letter string
// over the alphabet A, C, G, T. Each pair of bits in the key selects one
// letter (0=A, 1=C, 2=G, 3=T), with the most significant pair giving the
// first letter, so that 27 (binary 00 01 10 11) yields "ACGT".
// The table is computed once during package initialization.
var ncbi2naToIupac = func() map[int]string {

	const bases = "ACGT"

	table := make(map[int]string, 256)

	for idx := 0; idx < 256; idx++ {
		// unpack the four 2-bit codes from the high bits down to the low bits
		quad := []byte{
			bases[(idx>>6)&3],
			bases[(idx>>4)&3],
			bases[(idx>>2)&3],
			bases[idx&3],
		}
		table[idx] = string(quad)
	}

	return table
}()
1526
1527/*
1528	var conv = []string{"N", "A", "C", "M", "G", "R", "S", "V", "T", "W", "Y", "H", "K", "D", "B", "N"}
1529	for i := 0; i < 16; i++ {
1530		for j := 0; j < 16; j++ {
1531			base := conv[i] + conv[j]
1532			idx := i*16 + j
1533			fmt.Fprintf(os.Stdout, "\t%d: \"%s\",\n", idx, base)
1534		}
1535	}
1536*/
1537
// ncbi4naToIupac maps every value from 0 through 255 to a two-letter string.
// The high four bits of the key select the first letter and the low four
// bits select the second, both taken from the 16-entry code string
// "NACMGRSVTWYHKDBN" (so codes 0 and 15 both produce N), e.g. 24 (0x18)
// yields "AT". The table is computed once during package initialization.
var ncbi4naToIupac = func() map[int]string {

	const codes = "NACMGRSVTWYHKDBN"

	table := make(map[int]string, 256)

	for idx := 0; idx < 256; idx++ {
		// high nibble first, low nibble second
		pair := []byte{codes[idx>>4], codes[idx&15]}
		table[idx] = string(pair)
	}

	return table
}()
1796
1797// DATA OBJECTS
1798
1799// Step contains parameters for executing a single command step
1800type Step struct {
1801	Type   OpType
1802	Value  string
1803	Parent string
1804	Match  string
1805	Attrib string
1806	TypL   RangeType
1807	StrL   string
1808	IntL   int
1809	TypR   RangeType
1810	StrR   string
1811	IntR   int
1812	Norm   bool
1813	Wild   bool
1814}
1815
// Operation breaks commands into sequential steps
type Operation struct {
	Type   OpType  // command that introduced the operation (e.g., IF, UNLESS, AND, OR)
	Value  string  // argument string that followed the command
	Stages []*Step // steps parsed from the argument, followed by any constraint steps appended later
}
1822
1823// Block contains nested instructions for executing commands
1824type Block struct {
1825	Visit      string
1826	Parent     string
1827	Match      string
1828	Path       []string
1829	Working    []string
1830	Parsed     []string
1831	Position   string
1832	Foreword   string
1833	Afterword  string
1834	Conditions []*Operation
1835	Commands   []*Operation
1836	Failure    []*Operation
1837	Subtasks   []*Block
1838}
1839
1840// Limiter is used for collecting specific nodes (e.g., first and last)
1841type Limiter struct {
1842	Obj *eutils.XMLNode
1843	Idx int
1844	Lvl int
1845}
1846
1847// UTILITIES
1848
// hasSpaceOrHyphen reports whether str contains at least one space or hyphen
func hasSpaceOrHyphen(str string) bool {

	// both targets are single-byte ASCII characters, so a byte scan is sufficient
	for pos := 0; pos < len(str); pos++ {
		switch str[pos] {
		case ' ', '-':
			return true
		}
	}

	return false
}
1859
// isAllCapsOrDigits reports whether every character of str is an upper case
// letter or a digit (by Unicode classification), and is true for an empty string
func isAllCapsOrDigits(str string) bool {

	for _, r := range str {
		if unicode.IsUpper(r) || unicode.IsDigit(r) {
			continue
		}
		// first character outside the permitted classes settles the answer
		return false
	}

	return true
}
1870
1871// sortStringByWords sorts the individual words in a string
1872func sortStringByWords(str string) string {
1873
1874	str = eutils.RemoveCommaOrSemicolon(str)
1875
1876	// check for multiple words
1877	if hasSpaceOrHyphen(str) {
1878		flds := strings.Fields(str)
1879		sort.Slice(flds, func(i, j int) bool { return flds[i] < flds[j] })
1880		str = strings.Join(flds, " ")
1881		str = strings.Replace(str, "-", " ", -1)
1882		str = eutils.CompressRunsOfSpaces(str)
1883		str = strings.TrimRight(str, ".?:")
1884	}
1885
1886	return str
1887}
1888
1889func parseFlag(str string) OpType {
1890
1891	op, ok := opTypeIs[str]
1892	if ok {
1893		return op
1894	}
1895
1896	if len(str) > 1 && str[0] == '-' && isAllCapsOrDigits(str[1:]) {
1897		return VARIABLE
1898	}
1899
1900	if len(str) > 2 && strings.HasPrefix(str, "--") && isAllCapsOrDigits(str[2:]) {
1901		return ACCUMULATOR
1902	}
1903
1904	if len(str) > 0 && str[0] == '-' {
1905		return UNRECOGNIZED
1906	}
1907
1908	return UNSET
1909}
1910
1911func parseMarkup(str, cmd string) int {
1912
1913	switch str {
1914	case "fuse", "fused":
1915		return eutils.FUSE
1916	case "space", "spaces":
1917		return eutils.SPACE
1918	case "period", "periods":
1919		return eutils.PERIOD
1920	case "bracket", "brackets":
1921		return eutils.BRACKETS
1922	case "markdown":
1923		return eutils.MARKDOWN
1924	case "slash":
1925		return eutils.SLASH
1926	case "tag", "tags":
1927		return eutils.TAGS
1928	case "terse":
1929		return eutils.TERSE
1930	default:
1931		if str != "" {
1932			fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized %s value '%s'\n", cmd, str)
1933			os.Exit(1)
1934		}
1935	}
1936	return eutils.NOMARKUP
1937}
1938
1939// DebugBlock examines structure of parsed arguments (undocumented)
1940/*
1941func DebugBlock(blk *Block, depth int) {
1942
1943	doIndent := func(indt int) {
1944		for i := 1; i < indt; i++ {
1945			fmt.Fprintf(os.Stderr, "  ")
1946		}
1947	}
1948
1949	doIndent(depth)
1950
1951	if blk.Visit != "" {
1952		doIndent(depth + 1)
1953		fmt.Fprintf(os.Stderr, "<Visit> %s </Visit>\n", blk.Visit)
1954	}
1955	if len(blk.Parsed) > 0 {
1956		doIndent(depth + 1)
1957		fmt.Fprintf(os.Stderr, "<Parsed>")
1958		for _, str := range blk.Parsed {
1959			fmt.Fprintf(os.Stderr, " %s", str)
1960		}
1961		fmt.Fprintf(os.Stderr, " </Parsed>\n")
1962	}
1963
1964	if len(blk.Subtasks) > 0 {
1965		for _, sub := range blk.Subtasks {
1966			DebugBlock(sub, depth+1)
1967		}
1968	}
1969}
1970*/
1971
1972// PARSE COMMAND-LINE ARGUMENTS
1973
1974// parseArguments parses nested exploration instruction from command-line arguments
1975func parseArguments(cmdargs []string, pttrn string) *Block {
1976
1977	// different names of exploration control arguments allow multiple levels of nested "for" loops in a linear command line
1978	// (capitalized versions for backward-compatibility with original Perl implementation handling of recursive definitions)
1979	var (
1980		lcname = []string{
1981			"",
1982			"-unit",
1983			"-subset",
1984			"-section",
1985			"-block",
1986			"-branch",
1987			"-group",
1988			"-division",
1989			"-path",
1990			"-pattern",
1991		}
1992
1993		ucname = []string{
1994			"",
1995			"-Unit",
1996			"-Subset",
1997			"-Section",
1998			"-Block",
1999			"-Branch",
2000			"-Group",
2001			"-Division",
2002			"-Path",
2003			"-Pattern",
2004		}
2005	)
2006
2007	// parseCommands recursive definition
2008	var parseCommands func(parent *Block, startLevel LevelType)
2009
2010	// parseCommands does initial parsing of exploration command structure
2011	parseCommands = func(parent *Block, startLevel LevelType) {
2012
2013		// find next highest level exploration argument
2014		findNextLevel := func(args []string, level LevelType) (LevelType, string, string) {
2015
2016			if len(args) > 1 {
2017
2018				for {
2019
2020					if level < UNIT {
2021						break
2022					}
2023
2024					lctag := lcname[level]
2025					uctag := ucname[level]
2026
2027					for _, txt := range args {
2028						if txt == lctag || txt == uctag {
2029							return level, lctag, uctag
2030						}
2031					}
2032
2033					level--
2034				}
2035			}
2036
2037			return 0, "", ""
2038		}
2039
2040		arguments := parent.Working
2041
2042		level, lctag, uctag := findNextLevel(arguments, startLevel)
2043
2044		if level < UNIT {
2045
2046			// break recursion
2047			return
2048		}
2049
2050		// group arguments at a given exploration level
2051		subsetCommands := func(args []string) *Block {
2052
2053			max := len(args)
2054
2055			visit := ""
2056
2057			// extract name of object to visit
2058			if max > 1 {
2059				visit = args[1]
2060				args = args[2:]
2061				max -= 2
2062			}
2063
2064			partition := 0
2065			for cur, str := range args {
2066
2067				// record point of next exploration command
2068				partition = cur + 1
2069
2070				// skip if not a command
2071				if len(str) < 1 || str[0] != '-' {
2072					continue
2073				}
2074
2075				if argTypeIs[str] == EXPLORATION {
2076					partition = cur
2077					break
2078				}
2079			}
2080
2081			// convert slashes (e.g., parent/child construct) to periods (e.g., dotted exploration path)
2082			if strings.Contains(visit, "/") {
2083				if !strings.Contains(visit, ".") {
2084					visit = strings.Replace(visit, "/", ".", -1)
2085				}
2086			}
2087
2088			// parse parent.child or dotted path construct
2089			// colon indicates a namespace prefix in any or all of the components
2090			prnt, rmdr := eutils.SplitInTwoRight(visit, ".")
2091			match, rest := eutils.SplitInTwoLeft(rmdr, ".")
2092
2093			if rest != "" {
2094
2095				// exploration match on first component, then search remainder one level at a time with subsequent components
2096				dirs := strings.Split(rmdr, ".")
2097
2098				// signal with "path" position
2099				return &Block{Visit: visit, Parent: "", Match: prnt, Path: dirs, Position: "path", Parsed: args[0:partition], Working: args[partition:]}
2100			}
2101
2102			// promote arguments parsed at this level
2103			return &Block{Visit: visit, Parent: prnt, Match: match, Parsed: args[0:partition], Working: args[partition:]}
2104		}
2105
2106		cur := 0
2107
2108		// search for positions of current exploration command
2109
2110		for idx, txt := range arguments {
2111			if txt == lctag || txt == uctag {
2112				if idx == 0 {
2113					continue
2114				}
2115
2116				blk := subsetCommands(arguments[cur:idx])
2117				parseCommands(blk, level-1)
2118				parent.Subtasks = append(parent.Subtasks, blk)
2119
2120				cur = idx
2121			}
2122		}
2123
2124		if cur < len(arguments) {
2125			blk := subsetCommands(arguments[cur:])
2126			parseCommands(blk, level-1)
2127			parent.Subtasks = append(parent.Subtasks, blk)
2128		}
2129
2130		// clear execution arguments from parent after subsetting
2131		parent.Working = nil
2132	}
2133
2134	// parse optional [min:max], [&VAR:&VAR], or [after|before] range specification
2135	parseRange := func(item, rnge string) (typL RangeType, strL string, intL int, typR RangeType, strR string, intR int) {
2136
2137		typL = NORANGE
2138		typR = NORANGE
2139		strL = ""
2140		strR = ""
2141		intL = 0
2142		intR = 0
2143
2144		if rnge == "" {
2145			// no range specification, return default values
2146			return
2147		}
2148
2149		// check if last character is right square bracket
2150		if !strings.HasSuffix(rnge, "]") {
2151			fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized range %s\n", rnge)
2152			os.Exit(1)
2153		}
2154
2155		rnge = strings.TrimSuffix(rnge, "]")
2156
2157		if rnge == "" {
2158			fmt.Fprintf(os.Stderr, "\nERROR: Empty range %s[]\n", item)
2159			os.Exit(1)
2160		}
2161
2162		// check for [after|before] variant
2163		if strings.Contains(rnge, "|") {
2164
2165			strL, strR = eutils.SplitInTwoLeft(rnge, "|")
2166			// spacing matters, so do not call TrimSpace
2167
2168			if strL == "" && strR == "" {
2169				fmt.Fprintf(os.Stderr, "\nERROR: Empty range %s[|]\n", item)
2170				os.Exit(1)
2171			}
2172
2173			typL = STRINGRANGE
2174			typR = STRINGRANGE
2175
2176			// return statement returns named variables
2177			return
2178		}
2179
2180		// otherwise must have colon within brackets
2181		if !strings.Contains(rnge, ":") {
2182			fmt.Fprintf(os.Stderr, "\nERROR: Colon missing in range %s[%s]\n", item, rnge)
2183			os.Exit(1)
2184		}
2185
2186		// split at colon
2187		lft, rgt := eutils.SplitInTwoLeft(rnge, ":")
2188
2189		lft = strings.TrimSpace(lft)
2190		rgt = strings.TrimSpace(rgt)
2191
2192		if lft == "" && rgt == "" {
2193			fmt.Fprintf(os.Stderr, "\nERROR: Empty range %s[:]\n", item)
2194			os.Exit(1)
2195		}
2196
2197		// for variable, parse optional +/- offset suffix
2198		parseOffset := func(str string) (string, int) {
2199
2200			if str == "" || str[0] == ' ' {
2201				fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized variable '&%s'\n", str)
2202				os.Exit(1)
2203			}
2204
2205			pls := ""
2206			mns := ""
2207
2208			ofs := 0
2209
2210			// check for &VAR+1 or &VAR-1 integer adjustment
2211			str, pls = eutils.SplitInTwoLeft(str, "+")
2212			str, mns = eutils.SplitInTwoLeft(str, "-")
2213
2214			if pls != "" {
2215				val, err := strconv.Atoi(pls)
2216				if err != nil {
2217					fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized range adjustment &%s+%s\n", str, pls)
2218					os.Exit(1)
2219				}
2220				ofs = val
2221			} else if mns != "" {
2222				val, err := strconv.Atoi(mns)
2223				if err != nil {
2224					fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized range adjustment &%s-%s\n", str, mns)
2225					os.Exit(1)
2226				}
2227				ofs = -val
2228			}
2229
2230			return str, ofs
2231		}
2232
2233		// parse integer position, 1-based coordinate must be greater than 0
2234		parseInteger := func(str string, mustBePositive bool) int {
2235			if str == "" {
2236				return 0
2237			}
2238
2239			val, err := strconv.Atoi(str)
2240			if err != nil {
2241				fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized range component %s[%s:]\n", item, str)
2242				os.Exit(1)
2243			}
2244			if mustBePositive {
2245				if val < 1 {
2246					fmt.Fprintf(os.Stderr, "\nERROR: Range component %s[%s:] must be positive\n", item, str)
2247					os.Exit(1)
2248				}
2249			} else {
2250				if val == 0 {
2251					fmt.Fprintf(os.Stderr, "\nERROR: Range component %s[%s:] must not be zero\n", item, str)
2252					os.Exit(1)
2253				}
2254			}
2255
2256			return val
2257		}
2258
2259		if lft != "" {
2260			if lft[0] == '&' {
2261				lft = lft[1:]
2262				strL, intL = parseOffset(lft)
2263				typL = VARIABLERANGE
2264			} else {
2265				intL = parseInteger(lft, true)
2266				typL = INTEGERRANGE
2267			}
2268		}
2269
2270		if rgt != "" {
2271			if rgt[0] == '&' {
2272				rgt = rgt[1:]
2273				strR, intR = parseOffset(rgt)
2274				typR = VARIABLERANGE
2275			} else {
2276				intR = parseInteger(rgt, false)
2277				typR = INTEGERRANGE
2278			}
2279		}
2280
2281		// return statement required to return named variables
2282		return
2283	}
2284
2285	parseConditionals := func(cmds *Block, arguments []string) []*Operation {
2286
2287		max := len(arguments)
2288		if max < 1 {
2289			return nil
2290		}
2291
2292		// check for missing condition command
2293		txt := arguments[0]
2294		if txt != "-if" && txt != "-unless" && txt != "-select" && txt != "-match" && txt != "-avoid" && txt != "-position" {
2295			fmt.Fprintf(os.Stderr, "\nERROR: Missing -if command before '%s'\n", txt)
2296			os.Exit(1)
2297		}
2298		if txt == "-position" && max > 2 {
2299			fmt.Fprintf(os.Stderr, "\nERROR: Cannot combine -position with -if or -unless commands\n")
2300			os.Exit(1)
2301		}
2302		// check for missing argument after last condition
2303		txt = arguments[max-1]
2304		if len(txt) > 0 && txt[0] == '-' {
2305			fmt.Fprintf(os.Stderr, "\nERROR: Item missing after %s command\n", txt)
2306			os.Exit(1)
2307		}
2308
2309		cond := make([]*Operation, 0, max)
2310
2311		// parse conditional clause into execution step
2312		parseStep := func(op *Operation, elementColonValue bool) {
2313
2314			if op == nil {
2315				return
2316			}
2317
2318			str := op.Value
2319
2320			status := ELEMENT
2321
2322			// isolate and parse optional [min:max], [&VAR:&VAR], or [after|before] range specification
2323			str, rnge := eutils.SplitInTwoLeft(str, "[")
2324
2325			str = strings.TrimSpace(str)
2326			rnge = strings.TrimSpace(rnge)
2327
2328			if str == "" && rnge != "" {
2329				fmt.Fprintf(os.Stderr, "\nERROR: Variable missing in range specification [%s\n", rnge)
2330				os.Exit(1)
2331			}
2332
2333			typL, strL, intL, typR, strR, intR := parseRange(str, rnge)
2334
2335			// check for pound, percent, or caret character at beginning of name
2336			if len(str) > 1 {
2337				switch str[0] {
2338				case '&':
2339					if isAllCapsOrDigits(str[1:]) {
2340						status = VARIABLE
2341						str = str[1:]
2342					} else if strings.Contains(str, ":") {
2343						fmt.Fprintf(os.Stderr, "\nERROR: Unsupported construct '%s', use -if &VARIABLE -equals VALUE instead\n", str)
2344						os.Exit(1)
2345					} else {
2346						fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized variable '%s'\n", str)
2347						os.Exit(1)
2348					}
2349				case '#':
2350					status = COUNT
2351					str = str[1:]
2352				case '%':
2353					status = LENGTH
2354					str = str[1:]
2355				case '^':
2356					status = DEPTH
2357					str = str[1:]
2358				default:
2359				}
2360			} else if str == "+" {
2361				status = INDEX
2362			}
2363
2364			// parse parent/element@attribute construct
2365			// colon indicates a namespace prefix in any or all of the components
2366			prnt, match := eutils.SplitInTwoRight(str, "/")
2367			match, attrib := eutils.SplitInTwoLeft(match, "@")
2368			val := ""
2369
2370			// leading colon indicates namespace prefix wildcard
2371			wildcard := false
2372			if strings.HasPrefix(prnt, ":") || strings.HasPrefix(match, ":") || strings.HasPrefix(attrib, ":") {
2373				wildcard = true
2374			}
2375
2376			if elementColonValue {
2377
2378				// allow parent/element@attribute:value construct for deprecated -match and -avoid, and for subsequent -and and -or commands
2379				match, val = eutils.SplitInTwoLeft(str, ":")
2380				prnt, match = eutils.SplitInTwoRight(match, "/")
2381				match, attrib = eutils.SplitInTwoLeft(match, "@")
2382			}
2383
2384			norm := true
2385			if rnge != "" {
2386				if typL != NORANGE || typR != NORANGE || strL != "" || strR != "" || intL != 0 || intR != 0 {
2387					norm = false
2388				}
2389			}
2390
2391			tsk := &Step{Type: status, Value: str, Parent: prnt, Match: match, Attrib: attrib,
2392				TypL: typL, StrL: strL, IntL: intL, TypR: typR, StrR: strR, IntR: intR,
2393				Norm: norm, Wild: wildcard}
2394
2395			op.Stages = append(op.Stages, tsk)
2396
2397			// transform old -match "element:value" to -match element -equals value
2398			if val != "" {
2399				tsk := &Step{Type: EQUALS, Value: val}
2400				op.Stages = append(op.Stages, tsk)
2401			}
2402		}
2403
2404		idx := 0
2405
2406		// conditionals should alternate between command and object/value
2407		expectDash := true
2408		last := ""
2409
2410		var op *Operation
2411
2412		// flag to allow element-colon-value for deprecated -match and -avoid commands, otherwise colon is for namespace prefixes
2413		elementColonValue := false
2414
2415		status := UNSET
2416
2417		// parse command strings into operation structure
2418		for idx < max {
2419			str := arguments[idx]
2420			idx++
2421
2422			// conditionals should alternate between command and object/value
2423			if expectDash {
2424				if len(str) < 1 || str[0] != '-' {
2425					fmt.Fprintf(os.Stderr, "\nERROR: Unexpected '%s' argument after '%s'\n", str, last)
2426					os.Exit(1)
2427				}
2428				expectDash = false
2429			} else {
2430				if len(str) > 0 && str[0] == '-' {
2431					fmt.Fprintf(os.Stderr, "\nERROR: Unexpected '%s' command after '%s'\n", str, last)
2432					os.Exit(1)
2433				}
2434				expectDash = true
2435			}
2436			last = str
2437
2438			switch status {
2439			case UNSET:
2440				status = parseFlag(str)
2441			case POSITION:
2442				if cmds.Position != "" {
2443					fmt.Fprintf(os.Stderr, "\nERROR: -position '%s' conflicts with existing '%s'\n", str, cmds.Position)
2444					os.Exit(1)
2445				}
2446				cmds.Position = str
2447				status = UNSET
2448			case MATCH, AVOID:
2449				elementColonValue = true
2450				fallthrough
2451			case SELECT, IF, UNLESS, AND, OR:
2452				op = &Operation{Type: status, Value: str}
2453				cond = append(cond, op)
2454				parseStep(op, elementColonValue)
2455				status = UNSET
2456			case EQUALS, CONTAINS, ISWITHIN, STARTSWITH, ENDSWITH, ISNOT, ISBEFORE, ISAFTER:
2457				if op != nil {
2458					if len(str) > 1 && str[0] == '\\' {
2459						// first character may be backslash protecting dash (undocumented)
2460						str = str[1:]
2461					}
2462					tsk := &Step{Type: status, Value: str}
2463					op.Stages = append(op.Stages, tsk)
2464					op = nil
2465				} else {
2466					fmt.Fprintf(os.Stderr, "\nERROR: Unexpected adjacent string match constraints\n")
2467					os.Exit(1)
2468				}
2469				status = UNSET
2470			case MATCHES:
2471				if op != nil {
2472					if len(str) > 1 && str[0] == '\\' {
2473						// first character may be backslash protecting dash (undocumented)
2474						str = str[1:]
2475					}
2476					str = eutils.RemoveCommaOrSemicolon(str)
2477					tsk := &Step{Type: status, Value: str}
2478					op.Stages = append(op.Stages, tsk)
2479					op = nil
2480				} else {
2481					fmt.Fprintf(os.Stderr, "\nERROR: Unexpected adjacent string match constraints\n")
2482					os.Exit(1)
2483				}
2484				status = UNSET
2485			case RESEMBLES:
2486				if op != nil {
2487					if len(str) > 1 && str[0] == '\\' {
2488						// first character may be backslash protecting dash (undocumented)
2489						str = str[1:]
2490					}
2491					str = sortStringByWords(str)
2492					tsk := &Step{Type: status, Value: str}
2493					op.Stages = append(op.Stages, tsk)
2494					op = nil
2495				} else {
2496					fmt.Fprintf(os.Stderr, "\nERROR: Unexpected adjacent string match constraints\n")
2497					os.Exit(1)
2498				}
2499				status = UNSET
2500			case ISEQUALTO, DIFFERSFROM:
2501				if op != nil {
2502					if len(str) < 1 {
2503						fmt.Fprintf(os.Stderr, "\nERROR: Empty conditional argument\n")
2504						os.Exit(1)
2505					}
2506					ch := str[0]
2507					// uses element as second argument
2508					orig := str
2509					if ch == '#' || ch == '%' || ch == '^' {
2510						// check for pound, percent, or caret character at beginning of element (undocumented)
2511						str = str[1:]
2512						if len(str) < 1 {
2513							fmt.Fprintf(os.Stderr, "\nERROR: Unexpected conditional constraints\n")
2514							os.Exit(1)
2515						}
2516						ch = str[0]
2517					}
2518					if (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') {
2519						prnt, match := eutils.SplitInTwoRight(str, "/")
2520						match, attrib := eutils.SplitInTwoLeft(match, "@")
2521						wildcard := false
2522						if strings.HasPrefix(prnt, ":") || strings.HasPrefix(match, ":") || strings.HasPrefix(attrib, ":") {
2523							wildcard = true
2524						}
2525						tsk := &Step{Type: status, Value: orig, Parent: prnt, Match: match, Attrib: attrib, Wild: wildcard}
2526						op.Stages = append(op.Stages, tsk)
2527					} else {
2528						fmt.Fprintf(os.Stderr, "\nERROR: Unexpected conditional constraints\n")
2529						os.Exit(1)
2530					}
2531					op = nil
2532				}
2533				status = UNSET
2534			case GT, GE, LT, LE, EQ, NE:
2535				if op != nil {
2536					if len(str) > 1 && str[0] == '\\' {
2537						// first character may be backslash protecting minus sign (undocumented)
2538						str = str[1:]
2539					}
2540					if len(str) < 1 {
2541						fmt.Fprintf(os.Stderr, "\nERROR: Empty numeric match constraints\n")
2542						os.Exit(1)
2543					}
2544					ch := str[0]
2545					if (ch >= '0' && ch <= '9') || ch == '-' || ch == '+' {
2546						// literal numeric constant
2547						tsk := &Step{Type: status, Value: str}
2548						op.Stages = append(op.Stages, tsk)
2549					} else {
2550						// numeric test allows element as second argument
2551						orig := str
2552						if ch == '#' || ch == '%' || ch == '^' {
2553							// check for pound, percent, or caret character at beginning of element (undocumented)
2554							str = str[1:]
2555							if len(str) < 1 {
2556								fmt.Fprintf(os.Stderr, "\nERROR: Unexpected numeric match constraints\n")
2557								os.Exit(1)
2558							}
2559							ch = str[0]
2560						}
2561						if (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') {
2562							prnt, match := eutils.SplitInTwoRight(str, "/")
2563							match, attrib := eutils.SplitInTwoLeft(match, "@")
2564							wildcard := false
2565							if strings.HasPrefix(prnt, ":") || strings.HasPrefix(match, ":") || strings.HasPrefix(attrib, ":") {
2566								wildcard = true
2567							}
2568							tsk := &Step{Type: status, Value: orig, Parent: prnt, Match: match, Attrib: attrib, Wild: wildcard}
2569							op.Stages = append(op.Stages, tsk)
2570						} else {
2571							fmt.Fprintf(os.Stderr, "\nERROR: Unexpected numeric match constraints\n")
2572							os.Exit(1)
2573						}
2574					}
2575					op = nil
2576				} else {
2577					fmt.Fprintf(os.Stderr, "\nERROR: Unexpected adjacent numeric match constraints\n")
2578					os.Exit(1)
2579				}
2580				status = UNSET
2581			case UNRECOGNIZED:
2582				fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized argument '%s'\n", str)
2583				os.Exit(1)
2584			default:
2585				fmt.Fprintf(os.Stderr, "\nERROR: Unexpected argument '%s'\n", str)
2586				os.Exit(1)
2587			}
2588		}
2589
2590		return cond
2591	}
2592
2593	parseExtractions := func(cmds *Block, arguments []string) []*Operation {
2594
2595		max := len(arguments)
2596		if max < 1 {
2597			return nil
2598		}
2599
2600		// check for missing -element (or -first, etc.) command
2601		txt := arguments[0]
2602		if len(txt) < 1 || txt[0] != '-' {
2603			fmt.Fprintf(os.Stderr, "\nERROR: Missing -element command before '%s'\n", txt)
2604			os.Exit(1)
2605		}
2606		// check for missing argument after last -element (or -first, etc.) command
2607		txt = arguments[max-1]
2608		if len(txt) > 0 && txt[0] == '-' {
2609			if txt == "-rst" {
2610				fmt.Fprintf(os.Stderr, "\nERROR: Unexpected position for %s command\n", txt)
2611				os.Exit(1)
2612			} else if txt == "-clr" {
2613				// main loop runs out after trailing -clr, add another so this one will be executed
2614				arguments = append(arguments, "-clr")
2615				max++
2616			} else if max < 2 || arguments[max-2] != "-lbl" {
2617				fmt.Fprintf(os.Stderr, "\nERROR: Item missing after %s command\n", txt)
2618				os.Exit(1)
2619			}
2620		}
2621
2622		comm := make([]*Operation, 0, max)
2623
2624		// parse next argument
2625		nextStatus := func(str string) OpType {
2626
2627			status := parseFlag(str)
2628
2629			switch status {
2630			case VARIABLE:
2631				op := &Operation{Type: status, Value: str[1:]}
2632				comm = append(comm, op)
2633				status = VALUE
2634			case ACCUMULATOR:
2635				op := &Operation{Type: status, Value: str[2:]}
2636				comm = append(comm, op)
2637				status = VALUE
2638			case CLR, RST:
2639				op := &Operation{Type: status, Value: ""}
2640				comm = append(comm, op)
2641				status = UNSET
2642			case ELEMENT, FIRST, LAST, ENCODE, DECODE, PLAIN, UPPER, LOWER, CHAIN, TITLE, ORDER, YEAR, DOI, TRANSLATE, REPLACE,
2643				TERMS, WORDS, PAIRS, REVERSE, LETTERS, CLAUSES, INDICES, MESHCODE, MATRIX, HISTOGRAM, ACCENTED,
2644				NUM, LEN, SUM, MIN, MAX, INC, DEC, SUB, AVG, DEV, MED, MUL, DIV, MOD, BIN, BIT, ZEROBASED, ONEBASED, UCSCBASED,
2645				REVCOMP, NUCLEIC, FASTA, NCBI2NA, NCBI4NA, MOLWT, HGVS:
2646			case TAB, RET, PFX, SFX, SEP, LBL, PFC, DEQ, PLG, ELG, WRP, ENC, DEF, REG, EXP, COLOR:
2647			case FWD, AWD, PKG:
2648			case UNSET:
2649				fmt.Fprintf(os.Stderr, "\nERROR: No -element before '%s'\n", str)
2650				os.Exit(1)
2651			case UNRECOGNIZED:
2652				fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized argument '%s'\n", str)
2653				os.Exit(1)
2654			default:
2655				fmt.Fprintf(os.Stderr, "\nERROR: Misplaced %s command\n", str)
2656				os.Exit(1)
2657			}
2658
2659			return status
2660		}
2661
2662		// parse extraction clause into individual steps
2663		parseSteps := func(op *Operation, pttrn string) {
2664
2665			if op == nil {
2666				return
2667			}
2668
2669			stat := op.Type
2670			str := op.Value
2671
2672			// element names combined with commas are treated as a prefix-separator-suffix group
2673			comma := strings.Split(str, ",")
2674
2675			rnge := ""
2676			for _, item := range comma {
2677				status := stat
2678
2679				// isolate and parse optional [min:max], [&VAR:&VAR], or [after|before] range specification
2680				item, rnge = eutils.SplitInTwoLeft(item, "[")
2681
2682				item = strings.TrimSpace(item)
2683				rnge = strings.TrimSpace(rnge)
2684
2685				if item == "" && rnge != "" {
2686					fmt.Fprintf(os.Stderr, "\nERROR: Variable missing in range specification [%s\n", rnge)
2687					os.Exit(1)
2688				}
2689
2690				typL, strL, intL, typR, strR, intR := parseRange(item, rnge)
2691
2692				// check for special character at beginning of name
2693				if len(item) > 1 {
2694					switch item[0] {
2695					case '&':
2696						if isAllCapsOrDigits(item[1:]) {
2697							status = VARIABLE
2698							item = item[1:]
2699						} else {
2700							fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized variable '%s'\n", item)
2701							os.Exit(1)
2702						}
2703					case '#':
2704						status = COUNT
2705						item = item[1:]
2706					case '%':
2707						status = LENGTH
2708						item = item[1:]
2709					case '^':
2710						status = DEPTH
2711						item = item[1:]
2712					case '*':
2713						for _, ch := range item {
2714							if ch != '*' {
2715								break
2716							}
2717						}
2718						status = STAR
2719					default:
2720					}
2721				} else {
2722					switch item {
2723					case "?":
2724						status = QUESTION
2725					case "*":
2726						status = STAR
2727					case "$":
2728						status = DOLLAR
2729					case "@":
2730						status = ATSIGN
2731					case "+":
2732						status = INDEX
2733					default:
2734					}
2735				}
2736
2737				// parse parent/element@attribute construct
2738				// colon indicates a namespace prefix in any or all of the components
2739				prnt, match := eutils.SplitInTwoRight(item, "/")
2740				match, attrib := eutils.SplitInTwoLeft(match, "@")
2741
2742				// leading colon indicates namespace prefix wildcard
2743				wildcard := false
2744				if strings.HasPrefix(prnt, ":") || strings.HasPrefix(match, ":") || strings.HasPrefix(attrib, ":") {
2745					wildcard = true
2746				}
2747
2748				// sequence coordinate adjustments
2749				switch status {
2750				case ZEROBASED, ONEBASED, UCSCBASED:
2751					seq := pttrn + ":"
2752					if attrib != "" {
2753						seq += "@"
2754						seq += attrib
2755					} else if match != "" {
2756						seq += match
2757					}
2758					// confirm -0-based or -1-based arguments are known sequence position elements or attributes
2759					slock.RLock()
2760					seqtype, ok := sequenceTypeIs[seq]
2761					slock.RUnlock()
2762					if !ok {
2763						fmt.Fprintf(os.Stderr, "\nERROR: Element '%s' is not suitable for sequence coordinate conversion\n", item)
2764						os.Exit(1)
2765					}
2766					switch status {
2767					case ZEROBASED:
2768						status = ELEMENT
2769						// if 1-based coordinates, decrement to get 0-based value
2770						if seqtype.Based == 1 {
2771							status = DEC
2772						}
2773					case ONEBASED:
2774						status = ELEMENT
2775						// if 0-based coordinates, increment to get 1-based value
2776						if seqtype.Based == 0 {
2777							status = INC
2778						}
2779					case UCSCBASED:
2780						status = ELEMENT
2781						// half-open intervals, start is 0-based, stop is 1-based
2782						if seqtype.Based == 0 && seqtype.Which == ISSTOP {
2783							status = INC
2784						} else if seqtype.Based == 1 && seqtype.Which == ISSTART {
2785							status = DEC
2786						}
2787					default:
2788						status = ELEMENT
2789					}
2790				default:
2791				}
2792
2793				norm := true
2794				if rnge != "" {
2795					if typL != NORANGE || typR != NORANGE || strL != "" || strR != "" || intL != 0 || intR != 0 {
2796						norm = false
2797					}
2798				}
2799
2800				tsk := &Step{Type: status, Value: item, Parent: prnt, Match: match, Attrib: attrib,
2801					TypL: typL, StrL: strL, IntL: intL, TypR: typR, StrR: strR, IntR: intR,
2802					Norm: norm, Wild: wildcard}
2803
2804				op.Stages = append(op.Stages, tsk)
2805			}
2806		}
2807
2808		idx := 0
2809
2810		status := UNSET
2811
2812		// parse command strings into operation structure
2813		for idx < max {
2814			str := arguments[idx]
2815			idx++
2816
2817			if argTypeIs[str] == CONDITIONAL {
2818				fmt.Fprintf(os.Stderr, "\nERROR: Misplaced %s command\n", str)
2819				os.Exit(1)
2820			}
2821
2822			switch status {
2823			case UNSET:
2824				status = nextStatus(str)
2825			case ELEMENT, FIRST, LAST, ENCODE, DECODE, PLAIN, UPPER, LOWER, CHAIN, TITLE, ORDER, YEAR, DOI, TRANSLATE, REPLACE,
2826				TERMS, WORDS, PAIRS, REVERSE, LETTERS, CLAUSES, INDICES, MESHCODE, MATRIX, HISTOGRAM, ACCENTED,
2827				NUM, LEN, SUM, MIN, MAX, INC, DEC, SUB, AVG, DEV, MED, MUL, DIV, MOD, BIN, BIT, ZEROBASED, ONEBASED, UCSCBASED,
2828				REVCOMP, NUCLEIC, FASTA, NCBI2NA, NCBI4NA, MOLWT, HGVS:
2829				for !strings.HasPrefix(str, "-") {
2830					// create one operation per argument, even if under a single -element statement
2831					op := &Operation{Type: status, Value: str}
2832					comm = append(comm, op)
2833					parseSteps(op, pttrn)
2834					if idx >= max {
2835						break
2836					}
2837					str = arguments[idx]
2838					idx++
2839				}
2840				status = UNSET
2841				if idx < max {
2842					status = nextStatus(str)
2843				}
2844			case TAB, RET, PFX, SFX, SEP, LBL, PFC, DEQ, PLG, ELG, WRP, ENC, DEF, REG, EXP, COLOR:
2845				op := &Operation{Type: status, Value: eutils.ConvertSlash(str)}
2846				comm = append(comm, op)
2847				status = UNSET
2848			case FWD:
2849				cmds.Foreword = eutils.ConvertSlash(str)
2850				status = UNSET
2851			case AWD:
2852				cmds.Afterword = eutils.ConvertSlash(str)
2853				status = UNSET
2854			case PKG:
2855				pkg := eutils.ConvertSlash(str)
2856				cmds.Foreword = ""
2857				cmds.Afterword = ""
2858				if pkg != "" && pkg != "-" {
2859					items := strings.Split(pkg, "/")
2860					for i := 0; i < len(items); i++ {
2861						cmds.Foreword += "<" + items[i] + ">"
2862					}
2863					for i := len(items) - 1; i >= 0; i-- {
2864						cmds.Afterword += "</" + items[i] + ">"
2865					}
2866				}
2867				status = UNSET
2868			case VARIABLE:
2869				op := &Operation{Type: status, Value: str[1:]}
2870				comm = append(comm, op)
2871				status = VALUE
2872			case ACCUMULATOR:
2873				op := &Operation{Type: status, Value: str[2:]}
2874				comm = append(comm, op)
2875				status = VALUE
2876			case VALUE:
2877				op := &Operation{Type: status, Value: str}
2878				comm = append(comm, op)
2879				parseSteps(op, pttrn)
2880				status = UNSET
2881			case UNRECOGNIZED:
2882				fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized argument '%s'\n", str)
2883				os.Exit(1)
2884			default:
2885			}
2886		}
2887
2888		return comm
2889	}
2890
2891	// parseOperations recursive definition
2892	var parseOperations func(parent *Block)
2893
2894	// parseOperations converts parsed arguments to operations lists
2895	parseOperations = func(parent *Block) {
2896
2897		args := parent.Parsed
2898
2899		partition := 0
2900		for cur, str := range args {
2901
2902			// record junction between conditional and extraction commands
2903			partition = cur + 1
2904
2905			// skip if not a command
2906			if len(str) < 1 || str[0] != '-' {
2907				continue
2908			}
2909
2910			if argTypeIs[str] != CONDITIONAL {
2911				partition = cur
2912				break
2913			}
2914		}
2915
2916		// split arguments into conditional tests and extraction or customization commands
2917		conditionals := args[0:partition]
2918		args = args[partition:]
2919
2920		partition = 0
2921		foundElse := false
2922		for cur, str := range args {
2923
2924			// record junction at -else command
2925			partition = cur + 1
2926
2927			// skip if not a command
2928			if len(str) < 1 || str[0] != '-' {
2929				continue
2930			}
2931
2932			if str == "-else" {
2933				partition = cur
2934				foundElse = true
2935				break
2936			}
2937		}
2938
2939		extractions := args[0:partition]
2940		alternative := args[partition:]
2941
2942		if len(alternative) > 0 && alternative[0] == "-else" {
2943			alternative = alternative[1:]
2944		}
2945
2946		// validate argument structure and convert to operations lists
2947		parent.Conditions = parseConditionals(parent, conditionals)
2948		parent.Commands = parseExtractions(parent, extractions)
2949		parent.Failure = parseExtractions(parent, alternative)
2950
2951		// reality checks on placement of -else command
2952		if foundElse {
2953			if len(conditionals) < 1 {
2954				fmt.Fprintf(os.Stderr, "\nERROR: Misplaced -else command\n")
2955				os.Exit(1)
2956			}
2957			if len(alternative) < 1 {
2958				fmt.Fprintf(os.Stderr, "\nERROR: Misplaced -else command\n")
2959				os.Exit(1)
2960			}
2961			if len(parent.Subtasks) > 0 {
2962				fmt.Fprintf(os.Stderr, "\nERROR: Misplaced -else command\n")
2963				os.Exit(1)
2964			}
2965		}
2966
2967		for _, sub := range parent.Subtasks {
2968			parseOperations(sub)
2969		}
2970	}
2971
2972	// parseArguments
2973
2974	head := &Block{}
2975
2976	for _, txt := range cmdargs {
2977		head.Working = append(head.Working, txt)
2978	}
2979
2980	// initial parsing of exploration command structure
2981	parseCommands(head, PATTERN)
2982
2983	if len(head.Subtasks) != 1 {
2984		return nil
2985	}
2986
2987	// skip past empty placeholder
2988	head = head.Subtasks[0]
2989
2990	// convert command strings to array of operations for faster processing
2991	parseOperations(head)
2992
2993	// check for no -element or multiple -pattern commands
2994	noElement := true
2995	numPatterns := 0
2996	for _, txt := range cmdargs {
2997		if argTypeIs[txt] == EXTRACTION {
2998			noElement = false
2999		}
3000		if txt == "-pattern" || txt == "-Pattern" {
3001			numPatterns++
3002		} else if txt == "-select" {
3003			noElement = false
3004			head.Position = "select"
3005		}
3006	}
3007
3008	if numPatterns < 1 {
3009		fmt.Fprintf(os.Stderr, "\nERROR: No -pattern in command-line arguments\n")
3010		os.Exit(1)
3011	}
3012
3013	if numPatterns > 1 {
3014		fmt.Fprintf(os.Stderr, "\nERROR: Only one -pattern command is permitted\n")
3015		os.Exit(1)
3016	}
3017
3018	if noElement {
3019		fmt.Fprintf(os.Stderr, "\nERROR: No -element statement in argument list\n")
3020		os.Exit(1)
3021	}
3022
3023	return head
3024}
3025
3026// printSubtree supports compression styles selected by -element "*" through "****"
3027func printSubtree(node *eutils.XMLNode, style IndentType, printAttrs bool, proc func(string)) {
3028
3029	if node == nil || proc == nil {
3030		return
3031	}
3032
3033	// WRAPPED is SUBTREE plus each attribute on its own line
3034	wrapped := false
3035	if style == WRAPPED {
3036		style = SUBTREE
3037		wrapped = true
3038	}
3039
3040	// INDENT is offset by two spaces to allow for parent tag, SUBTREE is not offset
3041	initial := 1
3042	if style == SUBTREE {
3043		style = INDENT
3044		initial = 0
3045	}
3046
3047	// array to speed up indentation
3048	indentSpaces := []string{
3049		"",
3050		"  ",
3051		"    ",
3052		"      ",
3053		"        ",
3054		"          ",
3055		"            ",
3056		"              ",
3057		"                ",
3058		"                  ",
3059	}
3060
3061	// indent a specified number of spaces
3062	doIndent := func(indt int) {
3063		i := indt
3064		for i > 9 {
3065			proc("                    ")
3066			i -= 10
3067		}
3068		if i < 0 {
3069			return
3070		}
3071		proc(indentSpaces[i])
3072	}
3073
3074	// doSubtree recursive definition
3075	var doSubtree func(*eutils.XMLNode, int)
3076
3077	doSubtree = func(curr *eutils.XMLNode, depth int) {
3078
3079		// suppress if it would be an empty self-closing tag
3080		if !eutils.IsNotJustWhitespace(curr.Attributes) && curr.Contents == "" && curr.Children == nil {
3081			return
3082		}
3083
3084		if style == INDENT {
3085			doIndent(depth)
3086		}
3087
3088		if curr.Name != "" {
3089			proc("<")
3090			proc(curr.Name)
3091
3092			if printAttrs {
3093
3094				attr := strings.TrimSpace(curr.Attributes)
3095				attr = eutils.CompressRunsOfSpaces(attr)
3096
3097				if attr != "" {
3098
3099					if wrapped {
3100
3101						start := 0
3102						idx := 0
3103
3104						attlen := len(attr)
3105
3106						for idx < attlen {
3107							ch := attr[idx]
3108							if ch == '=' {
3109								str := attr[start:idx]
3110								proc("\n")
3111								doIndent(depth)
3112								proc(" ")
3113								proc(str)
3114								// skip past equal sign and leading double quote
3115								idx += 2
3116								start = idx
3117							} else if ch == '"' || ch == '\'' {
3118								str := attr[start:idx]
3119								proc("=\"")
3120								proc(str)
3121								proc("\"")
3122								// skip past trailing double quote and (possible) space
3123								idx += 2
3124								start = idx
3125							} else {
3126								idx++
3127							}
3128						}
3129
3130						proc("\n")
3131						doIndent(depth)
3132
3133					} else {
3134
3135						proc(" ")
3136						proc(attr)
3137					}
3138				}
3139			}
3140
3141			// see if suitable for for self-closing tag
3142			if curr.Contents == "" && curr.Children == nil {
3143				proc("/>")
3144				if style != COMPACT {
3145					proc("\n")
3146				}
3147				return
3148			}
3149
3150			proc(">")
3151		}
3152
3153		if curr.Contents != "" {
3154
3155			proc(curr.Contents[:])
3156
3157		} else {
3158
3159			if style != COMPACT {
3160				proc("\n")
3161			}
3162
3163			for chld := curr.Children; chld != nil; chld = chld.Next {
3164				doSubtree(chld, depth+1)
3165			}
3166
3167			if style == INDENT {
3168				i := depth
3169				for i > 9 {
3170					proc("                    ")
3171					i -= 10
3172				}
3173				proc(indentSpaces[i])
3174			}
3175		}
3176
3177		if curr.Name != "" {
3178			proc("<")
3179			proc("/")
3180			proc(curr.Name)
3181			proc(">")
3182		}
3183
3184		if style != COMPACT {
3185			proc("\n")
3186		}
3187	}
3188
3189	doSubtree(node, initial)
3190}
3191
// shared cache of compiled regular expressions used by processClause
var (
	// xlock is held while replx is allocated and while new entries are added to it
	xlock sync.Mutex
	// replx maps a string key to a compiled regular expression, created on first use
	replx map[string]*regexp.Regexp
)
3196
3197// processClause handles comma-separated -element arguments
3198func processClause(curr *eutils.XMLNode, stages []*Step, mask, prev, pfx, sfx, plg, sep, def, reg, exp string, wrp bool, status OpType, index, level int, variables map[string]string, transform map[string]string, histogram map[string]int) (string, bool) {
3199
3200	if curr == nil || stages == nil {
3201		return "", false
3202	}
3203
3204	if replx == nil {
3205		xlock.Lock()
3206		if replx == nil {
3207			replx = make(map[string]*regexp.Regexp)
3208		}
3209		xlock.Unlock()
3210	}
3211
3212	// processElement handles individual -element constructs
3213	processElement := func(acc func(string)) {
3214
3215		if acc == nil {
3216			return
3217		}
3218
3219		// element names combined with commas are treated as a prefix-separator-suffix group
3220		for _, stage := range stages {
3221
3222			stat := stage.Type
3223			item := stage.Value
3224			prnt := stage.Parent
3225			match := stage.Match
3226			attrib := stage.Attrib
3227			typL := stage.TypL
3228			strL := stage.StrL
3229			intL := stage.IntL
3230			typR := stage.TypR
3231			strR := stage.StrR
3232			intR := stage.IntR
3233			norm := stage.Norm
3234			wildcard := stage.Wild
3235			unescape := (stat != INDICES)
3236
3237			// exploreElements is a wrapper for eutils.ExploreElements, obtaining most arguments as closures
3238			exploreElements := func(proc func(string, int)) {
3239				eutils.ExploreElements(curr, mask, prnt, match, attrib, wildcard, unescape, level, proc)
3240			}
3241
3242			// sendSlice applies optional [min:max] range restriction and sends result to accumulator
3243			sendSlice := func(str string) {
3244
3245				// handle usual situation with no range first
3246				if norm {
3247					if wrp {
3248						str = html.EscapeString(str)
3249					}
3250					acc(str)
3251					return
3252				}
3253
3254				// check for [after|before] variant
3255				if typL == STRINGRANGE || typR == STRINGRANGE {
3256					if strL != "" {
3257						// use case-insensitive test
3258						strL = strings.ToUpper(strL)
3259						idx := strings.Index(strings.ToUpper(str), strL)
3260						if idx < 0 {
3261							// specified substring must be present in original string
3262							return
3263						}
3264						ln := len(strL)
3265						// remove leading text
3266						str = str[idx+ln:]
3267					}
3268					if strR != "" {
3269						strR = strings.ToUpper(strR)
3270						idx := strings.Index(strings.ToUpper(str), strR)
3271						if idx < 0 {
3272							// specified substring must be present in remaining string
3273							return
3274						}
3275						// remove trailing text
3276						str = str[:idx]
3277					}
3278					if str != "" {
3279						if wrp {
3280							str = html.EscapeString(str)
3281						}
3282						acc(str)
3283					}
3284					return
3285				}
3286
3287				min := 0
3288				max := 0
3289
3290				// slice arguments use variable value +- adjustment or integer constant
3291				if typL == VARIABLERANGE {
3292					if strL == "" {
3293						return
3294					}
3295					lft, ok := variables[strL]
3296					if !ok {
3297						return
3298					}
3299					val, err := strconv.Atoi(lft)
3300					if err != nil {
3301						return
3302					}
3303					// range argument values are inclusive and 1-based, decrement variable start +- offset to use in slice
3304					min = val + intL - 1
3305				} else if typL == INTEGERRANGE {
3306					// range argument values are inclusive and 1-based, decrement literal start to use in slice
3307					min = intL - 1
3308				}
3309				if typR == VARIABLERANGE {
3310					if strR == "" {
3311						return
3312					}
3313					rgt, ok := variables[strR]
3314					if !ok {
3315						return
3316					}
3317					val, err := strconv.Atoi(rgt)
3318					if err != nil {
3319						return
3320					}
3321					if val+intR < 0 {
3322						// negative value is 1-based inset from end of string (undocumented)
3323						max = len(str) + val + intR + 1
3324					} else {
3325						max = val + intR
3326					}
3327				} else if typR == INTEGERRANGE {
3328					if intR < 0 {
3329						// negative max is inset from end of string (undocumented)
3330						max = len(str) + intR + 1
3331					} else {
3332						max = intR
3333					}
3334				}
3335
3336				doRevComp := false
3337				doUpCase := false
3338				if status == NUCLEIC {
3339					// -nucleic uses direction of range to decide between forward strand or reverse complement
3340					if min+1 > max {
3341						min, max = max-1, min+1
3342						doRevComp = true
3343					}
3344					doUpCase = true
3345				}
3346
3347				// numeric range now calculated, apply slice to string
3348				if min == 0 && max == 0 {
3349					if doRevComp {
3350						str = eutils.ReverseComplement(str)
3351					}
3352					if doUpCase {
3353						str = strings.ToUpper(str)
3354					}
3355					if wrp {
3356						str = html.EscapeString(str)
3357					}
3358					acc(str)
3359				} else if max == 0 {
3360					if min > 0 && min < len(str) {
3361						str = str[min:]
3362						if str != "" {
3363							if doRevComp {
3364								str = eutils.ReverseComplement(str)
3365							}
3366							if doUpCase {
3367								str = strings.ToUpper(str)
3368							}
3369							if wrp {
3370								str = html.EscapeString(str)
3371							}
3372							acc(str)
3373						}
3374					}
3375				} else if min == 0 {
3376					if max > 0 && max <= len(str) {
3377						str = str[:max]
3378						if str != "" {
3379							if doRevComp {
3380								str = eutils.ReverseComplement(str)
3381							}
3382							if doUpCase {
3383								str = strings.ToUpper(str)
3384							}
3385							if wrp {
3386								str = html.EscapeString(str)
3387							}
3388							acc(str)
3389						}
3390					}
3391				} else {
3392					if min < max && min > 0 && max <= len(str) {
3393						str = str[min:max]
3394						if str != "" {
3395							if doRevComp {
3396								str = eutils.ReverseComplement(str)
3397							}
3398							if doUpCase {
3399								str = strings.ToUpper(str)
3400							}
3401							if wrp {
3402								str = html.EscapeString(str)
3403							}
3404							acc(str)
3405						}
3406					}
3407				}
3408			}
3409
3410			switch stat {
3411			case ELEMENT:
3412				exploreElements(func(str string, lvl int) {
3413					if str != "" {
3414						sendSlice(str)
3415					}
3416				})
3417			case TERMS, WORDS, PAIRS, REVERSE, LETTERS, CLAUSES, INDICES, MESHCODE, MATRIX, HISTOGRAM, ACCENTED,
3418				VALUE, LEN, SUM, MIN, MAX, SUB, AVG, DEV, MED, MUL, DIV, MOD, BIN, BIT,
3419				REVCOMP, NUCLEIC, FASTA, NCBI2NA, NCBI4NA, MOLWT, HGVS:
3420				exploreElements(func(str string, lvl int) {
3421					if str != "" {
3422						sendSlice(str)
3423					}
3424				})
3425			case FIRST:
3426				single := ""
3427
3428				exploreElements(func(str string, lvl int) {
3429					if single == "" {
3430						single = str
3431					}
3432				})
3433
3434				if single != "" {
3435					sendSlice(single)
3436				}
3437			case LAST:
3438				single := ""
3439
3440				exploreElements(func(str string, lvl int) {
3441					single = str
3442				})
3443
3444				if single != "" {
3445					sendSlice(single)
3446				}
3447			case ENCODE:
3448				exploreElements(func(str string, lvl int) {
3449					if str != "" {
3450						if !wrp {
3451							str = html.EscapeString(str)
3452						}
3453						sendSlice(str)
3454					}
3455				})
3456			case DECODE:
3457				// superseded by transmute -decode64 (undocumented)
3458				exploreElements(func(str string, lvl int) {
3459					if str != "" {
3460						txt, err := base64.StdEncoding.DecodeString(str)
3461						if err == nil {
3462							sendSlice(string(txt))
3463						}
3464					}
3465				})
3466			case PLAIN:
3467				exploreElements(func(str string, lvl int) {
3468					if str != "" {
3469						if eutils.IsNotASCII(str) {
3470							str = eutils.DoAccentTransform(str)
3471							if eutils.HasUnicodeMarkup(str) {
3472								str = eutils.RepairUnicodeMarkup(str, eutils.SPACE)
3473							}
3474						}
3475						if eutils.HasBadSpace(str) {
3476							str = eutils.CleanupBadSpaces(str)
3477						}
3478						if eutils.HasAngleBracket(str) {
3479							str = eutils.RepairTableMarkup(str, eutils.SPACE)
3480							str = eutils.RemoveEmbeddedMarkup(str)
3481							str = eutils.CompressRunsOfSpaces(str)
3482						}
3483						sendSlice(str)
3484					}
3485				})
3486			case UPPER:
3487				exploreElements(func(str string, lvl int) {
3488					if str != "" {
3489						str = strings.ToUpper(str)
3490						sendSlice(str)
3491					}
3492				})
3493			case LOWER:
3494				exploreElements(func(str string, lvl int) {
3495					if str != "" {
3496						str = strings.ToLower(str)
3497						sendSlice(str)
3498					}
3499				})
3500			case CHAIN:
3501				exploreElements(func(str string, lvl int) {
3502					if str != "" {
3503						str = strings.Replace(str, " ", "_", -1)
3504						sendSlice(str)
3505					}
3506				})
3507			case TITLE:
3508				exploreElements(func(str string, lvl int) {
3509					if str != "" {
3510						str = strings.ToLower(str)
3511						str = strings.Title(str)
3512						sendSlice(str)
3513					}
3514				})
3515			case ORDER:
3516				exploreElements(func(str string, lvl int) {
3517					if str != "" {
3518						str = sortStringByWords(str)
3519						sendSlice(str)
3520					}
3521				})
3522			case YEAR:
3523				exploreElements(func(str string, lvl int) {
3524					if str != "" {
3525						words := strings.FieldsFunc(str, func(c rune) bool {
3526							return !unicode.IsDigit(c)
3527						})
3528						for _, item := range words {
3529							if len(item) == 4 {
3530								sendSlice(item)
3531								// only print first year, e.g., PubDate/MedlineDate "2008 Dec-2009 Jan"
3532								return
3533							}
3534						}
3535					}
3536				})
3537			case DOI:
3538				exploreElements(func(str string, lvl int) {
3539					if str != "" {
3540						str = strings.TrimPrefix(str, "doi:")
3541						str = strings.TrimSpace(str)
3542						str = strings.TrimPrefix(str, "/")
3543						str = strings.TrimPrefix(str, "https://doi.org/")
3544						str = strings.TrimPrefix(str, "http://dx.doi.org/")
3545						str := url.QueryEscape(str)
3546						sendSlice("https://doi.org/" + str)
3547					}
3548				})
3549			case TRANSLATE:
3550				exploreElements(func(str string, lvl int) {
3551					if str != "" {
3552						txt, found := transform[str]
3553						if found {
3554							// require successful mapping
3555							sendSlice(txt)
3556						}
3557					}
3558				})
3559			case REPLACE:
3560				exploreElements(func(str string, lvl int) {
3561					if str != "" && replx != nil {
3562						re, found := replx[str]
3563						if !found {
3564							xlock.Lock()
3565							re, found = replx[str]
3566							if !found {
3567								nw, err := regexp.Compile(reg)
3568								if err == nil {
3569									replx[str] = nw
3570									re = nw
3571								}
3572							}
3573							xlock.Unlock()
3574						}
3575						if re != nil {
3576							txt := re.ReplaceAllString(str, exp)
3577							if txt != "" {
3578								sendSlice(txt)
3579							}
3580						}
3581					}
3582				})
3583			case VARIABLE, ACCUMULATOR:
3584				// use value of stored variable
3585				val, ok := variables[match]
3586				if ok {
3587					sendSlice(val)
3588				}
3589			case NUM, COUNT:
3590				count := 0
3591
3592				exploreElements(func(str string, lvl int) {
3593					count++
3594				})
3595
3596				// number of element objects
3597				val := strconv.Itoa(count)
3598				acc(val)
3599			case LENGTH:
3600				length := 0
3601
3602				exploreElements(func(str string, lvl int) {
3603					length += len(str)
3604				})
3605
3606				// length of element strings
3607				val := strconv.Itoa(length)
3608				acc(val)
3609			case DEPTH:
3610				exploreElements(func(str string, lvl int) {
3611					// depth of each element in scope
3612					val := strconv.Itoa(lvl)
3613					acc(val)
3614				})
3615			case INDEX:
3616				// -element "+" prints index of current XML object
3617				val := strconv.Itoa(index)
3618				acc(val)
3619			case INC:
3620				// -inc, or component of -0-based, -1-based, or -ucsc-based
3621				exploreElements(func(str string, lvl int) {
3622					if str != "" {
3623						num, err := strconv.Atoi(str)
3624						if err == nil {
3625							// increment value
3626							num++
3627							val := strconv.Itoa(num)
3628							acc(val)
3629						}
3630					}
3631				})
3632			case DEC:
3633				// -dec, or component of -0-based, -1-based, or -ucsc-based
3634				exploreElements(func(str string, lvl int) {
3635					if str != "" {
3636						num, err := strconv.Atoi(str)
3637						if err == nil {
3638							// decrement value
3639							num--
3640							val := strconv.Itoa(num)
3641							acc(val)
3642						}
3643					}
3644				})
3645			case QUESTION:
3646				acc(curr.Name)
3647			case STAR:
3648				// -element "*" prints current XML subtree on a single line
3649				style := SINGULARITY
3650				printAttrs := true
3651
3652				for _, ch := range item {
3653					if ch == '*' {
3654						style++
3655					} else if ch == '@' {
3656						printAttrs = false
3657					}
3658				}
3659				if style > WRAPPED {
3660					style = WRAPPED
3661				}
3662				if style < COMPACT {
3663					style = COMPACT
3664				}
3665
3666				var buffer strings.Builder
3667
3668				printSubtree(curr, style, printAttrs,
3669					func(str string) {
3670						if str != "" {
3671							buffer.WriteString(str)
3672						}
3673					})
3674
3675				txt := buffer.String()
3676				if txt != "" {
3677					acc(txt)
3678				}
3679			case DOLLAR:
3680				for chld := curr.Children; chld != nil; chld = chld.Next {
3681					acc(chld.Name)
3682				}
3683			case ATSIGN:
3684				if curr.Attributes != "" && curr.Attribs == nil {
3685					curr.Attribs = eutils.ParseAttributes(curr.Attributes)
3686				}
3687				for i := 0; i < len(curr.Attribs)-1; i += 2 {
3688					acc(curr.Attribs[i])
3689				}
3690			default:
3691			}
3692		}
3693	}
3694
3695	ok := false
3696
3697	// format results in buffer
3698	var buffer strings.Builder
3699
3700	buffer.WriteString(prev)
3701	buffer.WriteString(plg)
3702	buffer.WriteString(pfx)
3703	between := ""
3704
3705	switch status {
3706	case ELEMENT:
3707		processElement(func(str string) {
3708			if str != "" {
3709				ok = true
3710				buffer.WriteString(between)
3711				buffer.WriteString(str)
3712				between = sep
3713			}
3714		})
3715	case FIRST:
3716		single := ""
3717
3718		processElement(func(str string) {
3719			ok = true
3720			if single == "" {
3721				single = str
3722			}
3723		})
3724
3725		if single != "" {
3726			buffer.WriteString(between)
3727			buffer.WriteString(single)
3728			between = sep
3729		}
3730	case LAST:
3731		single := ""
3732
3733		processElement(func(str string) {
3734			ok = true
3735			single = str
3736		})
3737
3738		if single != "" {
3739			buffer.WriteString(between)
3740			buffer.WriteString(single)
3741			between = sep
3742		}
3743	case ENCODE, DECODE, PLAIN, UPPER, LOWER, CHAIN, TITLE, ORDER, YEAR, DOI, TRANSLATE,
3744		REPLACE, VALUE, NUM, INC, DEC, ZEROBASED, ONEBASED, UCSCBASED, NUCLEIC:
3745		processElement(func(str string) {
3746			if str != "" {
3747				ok = true
3748				buffer.WriteString(between)
3749				buffer.WriteString(str)
3750				between = sep
3751			}
3752		})
3753	case LEN:
3754		length := 0
3755
3756		processElement(func(str string) {
3757			length += len(str)
3758			ok = true
3759		})
3760
3761		if ok {
3762			// length of element strings
3763			val := strconv.Itoa(length)
3764			buffer.WriteString(between)
3765			buffer.WriteString(val)
3766			between = sep
3767		}
3768	case SUM:
3769		sum := 0
3770
3771		processElement(func(str string) {
3772			value, err := strconv.Atoi(str)
3773			if err == nil {
3774				sum += value
3775				ok = true
3776			}
3777		})
3778
3779		if ok {
3780			// sum of element values
3781			val := strconv.Itoa(sum)
3782			buffer.WriteString(between)
3783			buffer.WriteString(val)
3784			between = sep
3785		}
3786	case MIN:
3787		min := 0
3788
3789		processElement(func(str string) {
3790			value, err := strconv.Atoi(str)
3791			if err == nil {
3792				if !ok || value < min {
3793					min = value
3794				}
3795				ok = true
3796			}
3797		})
3798
3799		if ok {
3800			// minimum of element values
3801			val := strconv.Itoa(min)
3802			buffer.WriteString(between)
3803			buffer.WriteString(val)
3804			between = sep
3805		}
3806	case MAX:
3807		max := 0
3808
3809		processElement(func(str string) {
3810			value, err := strconv.Atoi(str)
3811			if err == nil {
3812				if !ok || value > max {
3813					max = value
3814				}
3815				ok = true
3816			}
3817		})
3818
3819		if ok {
3820			// maximum of element values
3821			val := strconv.Itoa(max)
3822			buffer.WriteString(between)
3823			buffer.WriteString(val)
3824			between = sep
3825		}
3826	case SUB:
3827		first := 0
3828		second := 0
3829		count := 0
3830
3831		processElement(func(str string) {
3832			value, err := strconv.Atoi(str)
3833			if err == nil {
3834				count++
3835				if count == 1 {
3836					first = value
3837				} else if count == 2 {
3838					second = value
3839				}
3840			}
3841		})
3842
3843		if count == 2 {
3844			// must have exactly 2 elements
3845			ok = true
3846			// difference of element values
3847			val := strconv.Itoa(first - second)
3848			buffer.WriteString(between)
3849			buffer.WriteString(val)
3850			between = sep
3851		}
3852	case AVG:
3853		sum := 0
3854		count := 0
3855
3856		processElement(func(str string) {
3857			value, err := strconv.Atoi(str)
3858			if err == nil {
3859				sum += value
3860				count++
3861				ok = true
3862			}
3863		})
3864
3865		if ok {
3866			// average of element values
3867			avg := int(float64(sum) / float64(count))
3868			val := strconv.Itoa(avg)
3869			buffer.WriteString(between)
3870			buffer.WriteString(val)
3871			between = sep
3872		}
3873	case DEV:
3874		count := 0
3875		mean := 0.0
3876		m2 := 0.0
3877
3878		processElement(func(str string) {
3879			value, err := strconv.Atoi(str)
3880			if err == nil {
3881				// Welford algorithm for one-pass standard deviation
3882				count++
3883				x := float64(value)
3884				delta := x - mean
3885				mean += delta / float64(count)
3886				m2 += delta * (x - mean)
3887			}
3888		})
3889
3890		if count > 1 {
3891			// must have at least 2 elements
3892			ok = true
3893			// standard deviation of element values
3894			vrc := m2 / float64(count-1)
3895			dev := int(math.Sqrt(vrc))
3896			val := strconv.Itoa(dev)
3897			buffer.WriteString(between)
3898			buffer.WriteString(val)
3899			between = sep
3900		}
3901	case MED:
3902		var arry []int
3903		count := 0
3904
3905		processElement(func(str string) {
3906			value, err := strconv.Atoi(str)
3907			if err == nil {
3908				arry = append(arry, value)
3909				count++
3910				ok = true
3911			}
3912		})
3913
3914		if ok {
3915			// median of element values
3916			sort.Slice(arry, func(i, j int) bool { return arry[i] < arry[j] })
3917			med := arry[count/2]
3918			val := strconv.Itoa(med)
3919			buffer.WriteString(between)
3920			buffer.WriteString(val)
3921			between = sep
3922		}
3923	case MUL:
3924		first := 0
3925		second := 0
3926		count := 0
3927
3928		processElement(func(str string) {
3929			value, err := strconv.Atoi(str)
3930			if err == nil {
3931				count++
3932				if count == 1 {
3933					first = value
3934				} else if count == 2 {
3935					second = value
3936				}
3937			}
3938		})
3939
3940		if count == 2 {
3941			// must have exactly 2 elements
3942			ok = true
3943			// product of element values
3944			val := strconv.Itoa(first * second)
3945			buffer.WriteString(between)
3946			buffer.WriteString(val)
3947			between = sep
3948		}
3949	case DIV:
3950		first := 0
3951		second := 0
3952		count := 0
3953
3954		processElement(func(str string) {
3955			value, err := strconv.Atoi(str)
3956			if err == nil {
3957				count++
3958				if count == 1 {
3959					first = value
3960				} else if count == 2 {
3961					second = value
3962				}
3963			}
3964		})
3965
3966		if count == 2 {
3967			// must have exactly 2 elements
3968			ok = true
3969			// quotient of element values
3970			val := strconv.Itoa(first / second)
3971			buffer.WriteString(between)
3972			buffer.WriteString(val)
3973			between = sep
3974		}
3975	case MOD:
3976		first := 0
3977		second := 0
3978		count := 0
3979
3980		processElement(func(str string) {
3981			value, err := strconv.Atoi(str)
3982			if err == nil {
3983				count++
3984				if count == 1 {
3985					first = value
3986				} else if count == 2 {
3987					second = value
3988				}
3989			}
3990		})
3991
3992		if count == 2 {
3993			// must have exactly 2 elements
3994			ok = true
3995			// modulus of element values
3996			val := strconv.Itoa(first % second)
3997			buffer.WriteString(between)
3998			buffer.WriteString(val)
3999			between = sep
4000		}
4001	case BIN:
4002		processElement(func(str string) {
4003			num, err := strconv.Atoi(str)
4004			if err == nil {
4005				// convert to binary representation
4006				val := strconv.FormatInt(int64(num), 2)
4007				buffer.WriteString(between)
4008				buffer.WriteString(val)
4009				between = sep
4010				ok = true
4011			}
4012		})
4013	case BIT:
4014		processElement(func(str string) {
4015			num, err := strconv.Atoi(str)
4016			if err == nil {
4017				// Kernighan algorithm for counting set bits
4018				count := 0
4019				for num != 0 {
4020					num &= num - 1
4021					count++
4022				}
4023				val := strconv.Itoa(count)
4024				buffer.WriteString(between)
4025				buffer.WriteString(val)
4026				between = sep
4027				ok = true
4028			}
4029		})
4030	case REVCOMP:
4031		processElement(func(str string) {
4032			if str != "" {
4033				ok = true
4034				buffer.WriteString(between)
4035				str = eutils.ReverseComplement(str)
4036				buffer.WriteString(str)
4037				between = sep
4038			}
4039		})
4040	case FASTA:
4041		processElement(func(str string) {
4042			for str != "" {
4043				mx := len(str)
4044				if mx > 50 {
4045					mx = 50
4046				}
4047				item := str[:mx]
4048				str = str[mx:]
4049				ok = true
4050				buffer.WriteString(between)
4051				buffer.WriteString(item)
4052				between = sep
4053			}
4054		})
4055	case NCBI2NA:
4056		processElement(func(str string) {
4057			if str != "" {
4058				src := []byte(str)
4059				dst := make([]byte, hex.DecodedLen(len(src)))
4060				n, err := hex.Decode(dst, src)
4061				if err == nil {
4062					dst = dst[:n]
4063					ok = true
4064					buffer.WriteString(between)
4065					for _, byt := range dst {
4066						tmp := ncbi2naToIupac[int(byt)]
4067						buffer.WriteString(tmp)
4068					}
4069					between = sep
4070				}
4071			}
4072		})
4073	case NCBI4NA:
4074		processElement(func(str string) {
4075			if str != "" {
4076				src := []byte(str)
4077				dst := make([]byte, hex.DecodedLen(len(src)))
4078				n, err := hex.Decode(dst, src)
4079				if err == nil {
4080					dst = dst[:n]
4081					ok = true
4082					buffer.WriteString(between)
4083					for _, byt := range dst {
4084						tmp := ncbi4naToIupac[int(byt)]
4085						buffer.WriteString(tmp)
4086					}
4087					between = sep
4088				}
4089			}
4090		})
4091	case MOLWT:
4092		processElement(func(str string) {
4093			if str != "" {
4094				ok = true
4095				buffer.WriteString(between)
4096				str = eutils.ProteinWeight(str, true)
4097				buffer.WriteString(str)
4098				between = sep
4099			}
4100		})
4101	case HGVS:
4102		processElement(func(str string) {
4103			if str != "" {
4104				ok = true
4105				buffer.WriteString(between)
4106				str = eutils.ParseHGVS(str)
4107				buffer.WriteString(str)
4108				between = sep
4109			}
4110		})
4111	case INDICES:
4112		norm := make(map[string][]string)
4113		stem := make(map[string][]string)
4114
4115		cumulative := 0
4116
4117		// mutex for inverted index
4118		var ilock sync.Mutex
4119
4120		addItem := func(field map[string][]string, term string, position int) {
4121
4122			// protect with mutex
4123			ilock.Lock()
4124
4125			arry, found := field[term]
4126			if !found {
4127				arry = make([]string, 0, 1)
4128			}
4129			arry = append(arry, strconv.Itoa(position))
4130			field[term] = arry
4131
4132			ilock.Unlock()
4133		}
4134
4135		processElement(func(str string) {
4136
4137			if str == "" {
4138				return
4139			}
4140
4141			if str == "[Not Available]." {
4142				return
4143			}
4144
4145			if eutils.IsNotASCII(str) {
4146				str = eutils.DoAccentTransform(str)
4147				if eutils.HasUnicodeMarkup(str) {
4148					str = eutils.RepairUnicodeMarkup(str, eutils.SPACE)
4149				}
4150			}
4151
4152			str = strings.ToLower(str)
4153
4154			if eutils.HasBadSpace(str) {
4155				str = eutils.CleanupBadSpaces(str)
4156			}
4157			if eutils.HasAngleBracket(str) {
4158				str = eutils.RepairEncodedMarkup(str)
4159				str = eutils.RepairTableMarkup(str, eutils.SPACE)
4160				str = eutils.RepairScriptMarkup(str, eutils.SPACE)
4161				str = eutils.RepairMathMLMarkup(str, eutils.SPACE)
4162				// RemoveEmbeddedMarkup must be called before UnescapeString, which was suppressed in eutils.ExploreElements
4163				str = eutils.RemoveEmbeddedMarkup(str)
4164			}
4165
4166			if eutils.HasAmpOrNotASCII(str) {
4167				str = html.UnescapeString(str)
4168				str = strings.ToLower(str)
4169			}
4170
4171			if eutils.IsNotASCII(str) {
4172				if eutils.HasGreek(str) {
4173					str = eutils.SpellGreek(str)
4174					str = eutils.CompressRunsOfSpaces(str)
4175				}
4176			}
4177
4178			str = strings.Replace(str, "(", " ", -1)
4179			str = strings.Replace(str, ")", " ", -1)
4180
4181			str = strings.Replace(str, "_", " ", -1)
4182
4183			if eutils.HasHyphenOrApostrophe(str) {
4184				str = eutils.FixSpecialCases(str)
4185			}
4186
4187			str = strings.Replace(str, "-", " ", -1)
4188
4189			// remove trailing punctuation from each word
4190			var arry []string
4191
4192			terms := strings.Fields(str)
4193			for _, item := range terms {
4194				max := len(item)
4195				for max > 1 {
4196					ch := item[max-1]
4197					if ch != '.' && ch != ',' && ch != ':' && ch != ';' {
4198						break
4199					}
4200					// trim trailing period, comma, colon, and semicolon
4201					item = item[:max-1]
4202					// continue checking for runs of punctuation at end
4203					max--
4204				}
4205				if item == "" {
4206					continue
4207				}
4208				arry = append(arry, item)
4209			}
4210
4211			// rejoin into string
4212			cleaned := strings.Join(arry, " ")
4213
4214			// break clauses at punctuation other than space or underscore, and at non-ASCII characters
4215			clauses := strings.FieldsFunc(cleaned, func(c rune) bool {
4216				return (!unicode.IsLetter(c) && !unicode.IsDigit(c)) && c != ' ' && c != '_' || c > 127
4217			})
4218
4219			// space replaces plus sign to separate runs of unpunctuated words
4220			phrases := strings.Join(clauses, " ")
4221
4222			// break phrases into individual words
4223			words := strings.Fields(phrases)
4224
4225			for _, item := range words {
4226
4227				cumulative++
4228
4229				// skip at site of punctuation break
4230				if item == "+" {
4231					continue
4232				}
4233
4234				// skip terms that are all digits
4235				if eutils.IsAllDigitsOrPeriod(item) {
4236					continue
4237				}
4238
4239				// optional stop word removal
4240				if deStop && eutils.IsStopWord(item) {
4241					continue
4242				}
4243
4244				// index single normalized term
4245				addItem(norm, item, cumulative)
4246				ok = true
4247
4248				// apply stemming algorithm
4249				item = porter2.Stem(item)
4250				item = strings.TrimSpace(item)
4251				addItem(stem, item, cumulative)
4252			}
4253
4254			// pad to avoid false positive proximity match of words in adjacent paragraphs
4255			rounded := ((cumulative + 99) / 100) * 100
4256			if rounded-cumulative < 20 {
4257				rounded += 100
4258			}
4259			cumulative = rounded
4260		})
4261
4262		prepareIndices := func(field map[string][]string, label string) {
4263
4264			if len(field) < 1 {
4265				return
4266			}
4267
4268			var arry []string
4269
4270			for item := range field {
4271				arry = append(arry, item)
4272			}
4273
4274			sort.Slice(arry, func(i, j int) bool { return arry[i] < arry[j] })
4275
4276			last := ""
4277			for _, item := range arry {
4278				item = strings.TrimSpace(item)
4279				if item == "" {
4280					continue
4281				}
4282				if item == last {
4283					// skip duplicate entry
4284					continue
4285				}
4286				buffer.WriteString("      <")
4287				buffer.WriteString(label)
4288				if len(field[item]) > 0 {
4289					buffer.WriteString(" pos=\"")
4290					attr := strings.Join(field[item], ",")
4291					buffer.WriteString(attr)
4292					buffer.WriteString("\"")
4293				}
4294				buffer.WriteString(">")
4295				buffer.WriteString(item)
4296				buffer.WriteString("</")
4297				buffer.WriteString(label)
4298				buffer.WriteString(">\n")
4299				last = item
4300			}
4301		}
4302
4303		if ok {
4304			prepareIndices(norm, "NORM")
4305			prepareIndices(stem, "STEM")
4306		}
4307	case TERMS:
4308		processElement(func(str string) {
4309			if str != "" {
4310
4311				terms := strings.Fields(str)
4312				for _, item := range terms {
4313					max := len(item)
4314					for max > 1 {
4315						ch := item[max-1]
4316						if ch != '.' && ch != ',' && ch != ':' && ch != ';' {
4317							break
4318						}
4319						// trim trailing period, comma, colon, and semicolon
4320						item = item[:max-1]
4321						// continue checking for runs of punctuation at end
4322						max--
4323					}
4324					if item == "" {
4325						continue
4326					}
4327					ok = true
4328					buffer.WriteString(between)
4329					buffer.WriteString(item)
4330					between = sep
4331				}
4332			}
4333		})
4334	case WORDS:
4335		processElement(func(str string) {
4336			if str != "" {
4337
4338				words := strings.FieldsFunc(str, func(c rune) bool {
4339					return !unicode.IsLetter(c) && !unicode.IsDigit(c)
4340				})
4341				for _, item := range words {
4342					item = strings.ToLower(item)
4343					if deStop {
4344						if eutils.IsStopWord(item) {
4345							continue
4346						}
4347					}
4348					if doStem {
4349						item = porter2.Stem(item)
4350						item = strings.TrimSpace(item)
4351					}
4352					if item == "" {
4353						continue
4354					}
4355					ok = true
4356					buffer.WriteString(between)
4357					buffer.WriteString(item)
4358					between = sep
4359				}
4360			}
4361		})
4362	case PAIRS:
4363		processElement(func(str string) {
4364			if str != "" {
4365
4366				// break clauses at punctuation other than space or underscore, and at non-ASCII characters
4367				clauses := strings.FieldsFunc(str, func(c rune) bool {
4368					return (!unicode.IsLetter(c) && !unicode.IsDigit(c)) && c != ' ' || c > 127
4369				})
4370
4371				// plus sign separates runs of unpunctuated words
4372				phrases := strings.Join(clauses, " + ")
4373
4374				// break phrases into individual words
4375				words := strings.FieldsFunc(phrases, func(c rune) bool {
4376					return !unicode.IsLetter(c) && !unicode.IsDigit(c)
4377				})
4378
4379				if len(words) > 1 {
4380					past := ""
4381					for _, item := range words {
4382						if item == "+" {
4383							past = ""
4384							continue
4385						}
4386						item = strings.ToLower(item)
4387						if deStop {
4388							if eutils.IsStopWord(item) {
4389								past = ""
4390								continue
4391							}
4392						}
4393						if doStem {
4394							item = porter2.Stem(item)
4395							item = strings.TrimSpace(item)
4396						}
4397						if item == "" {
4398							past = ""
4399							continue
4400						}
4401						if past != "" {
4402							ok = true
4403							buffer.WriteString(between)
4404							buffer.WriteString(past + " " + item)
4405							between = sep
4406						}
4407						past = item
4408					}
4409				}
4410			}
4411		})
4412	case REVERSE:
4413		processElement(func(str string) {
4414			if str != "" {
4415
4416				words := strings.FieldsFunc(str, func(c rune) bool {
4417					return !unicode.IsLetter(c) && !unicode.IsDigit(c)
4418				})
4419				for lf, rt := 0, len(words)-1; lf < rt; lf, rt = lf+1, rt-1 {
4420					words[lf], words[rt] = words[rt], words[lf]
4421				}
4422				for _, item := range words {
4423					item = strings.ToLower(item)
4424					if deStop {
4425						if eutils.IsStopWord(item) {
4426							continue
4427						}
4428					}
4429					if doStem {
4430						item = porter2.Stem(item)
4431						item = strings.TrimSpace(item)
4432					}
4433					if item == "" {
4434						continue
4435					}
4436					ok = true
4437					buffer.WriteString(between)
4438					buffer.WriteString(item)
4439					between = sep
4440				}
4441			}
4442		})
4443	case LETTERS:
4444		processElement(func(str string) {
4445			if str != "" {
4446				for _, ch := range str {
4447					ok = true
4448					buffer.WriteString(between)
4449					buffer.WriteRune(ch)
4450					between = sep
4451				}
4452			}
4453		})
4454	case CLAUSES:
4455		processElement(func(str string) {
4456			if str != "" {
4457
4458				clauses := strings.FieldsFunc(str, func(c rune) bool {
4459					return c == '.' || c == ',' || c == ';' || c == ':'
4460				})
4461				for _, item := range clauses {
4462					item = strings.ToLower(item)
4463					item = strings.TrimSpace(item)
4464					if item == "" {
4465						continue
4466					}
4467					ok = true
4468					buffer.WriteString(between)
4469					buffer.WriteString(item)
4470					between = sep
4471				}
4472			}
4473		})
4474	case MESHCODE:
4475		var code []string
4476		var tree []string
4477
4478		processElement(func(str string) {
4479			if str != "" {
4480				txt, found := transform[str]
4481				str = strings.ToLower(str)
4482				code = append(code, str)
4483				ok = true
4484
4485				if !found {
4486					return
4487				}
4488				txt = strings.ToLower(txt)
4489				txt = strings.Replace(txt, ".", "_", -1)
4490				codes := strings.FieldsFunc(txt, func(c rune) bool {
4491					return c == ','
4492				})
4493				for _, item := range codes {
4494					ch := item[0]
4495					if item == "" {
4496						continue
4497					}
4498					switch ch {
4499					case 'a', 'c', 'd', 'e', 'f', 'g', 'z':
4500						tree = append(tree, item)
4501					default:
4502					}
4503				}
4504			}
4505		})
4506
4507		if len(code) > 1 {
4508			sort.Slice(code, func(i, j int) bool { return code[i] < code[j] })
4509		}
4510		if len(tree) > 1 {
4511			sort.Slice(tree, func(i, j int) bool { return tree[i] < tree[j] })
4512		}
4513
4514		last := ""
4515		for _, item := range code {
4516			if item == last {
4517				// skip duplicate entry
4518				continue
4519			}
4520			buffer.WriteString("      <CODE>")
4521			buffer.WriteString(item)
4522			buffer.WriteString("</CODE>\n")
4523			last = item
4524		}
4525
4526		last = ""
4527		for _, item := range tree {
4528			if item == last {
4529				// skip duplicate entry
4530				continue
4531			}
4532			buffer.WriteString("      <TREE>")
4533			buffer.WriteString(item)
4534			buffer.WriteString("</TREE>\n")
4535			last = item
4536		}
4537	case MATRIX:
4538		var arry []string
4539
4540		processElement(func(str string) {
4541			if str != "" {
4542				txt, found := transform[str]
4543				if found {
4544					str = txt
4545				}
4546				arry = append(arry, str)
4547				ok = true
4548			}
4549		})
4550
4551		if len(arry) > 1 {
4552			sort.Slice(arry, func(i, j int) bool { return arry[i] < arry[j] })
4553
4554			for i, frst := range arry {
4555				for j, scnd := range arry {
4556					if i == j {
4557						continue
4558					}
4559					buffer.WriteString(between)
4560					buffer.WriteString(frst)
4561					buffer.WriteString("\t")
4562					buffer.WriteString(scnd)
4563					between = "\n"
4564				}
4565			}
4566		}
4567	case HISTOGRAM:
4568		processElement(func(str string) {
4569			if str != "" {
4570				ok = true
4571
4572				hlock.Lock()
4573
4574				val := histogram[str]
4575				val++
4576				histogram[str] = val
4577
4578				hlock.Unlock()
4579			}
4580		})
4581	case ACCENTED:
4582		processElement(func(str string) {
4583			if str != "" {
4584				found := false
4585				for _, ch := range str {
4586					if ch > 127 {
4587						found = true
4588						break
4589					}
4590				}
4591				if found {
4592					ok = true
4593					buffer.WriteString(between)
4594					buffer.WriteString(str)
4595					between = sep
4596				}
4597			}
4598		})
4599	default:
4600	}
4601
4602	// use default value if nothing written
4603	if !ok && def != "" {
4604		ok = true
4605		buffer.WriteString(def)
4606	}
4607
4608	buffer.WriteString(sfx)
4609
4610	if !ok {
4611		return "", false
4612	}
4613
4614	txt := buffer.String()
4615
4616	return txt, true
4617}
4618
4619// processInstructions performs extraction commands on a subset of XML
4620func processInstructions(commands []*Operation, curr *eutils.XMLNode, mask, tab, ret string, index, level int, variables map[string]string, transform map[string]string, histogram map[string]int, accum func(string)) (string, string) {
4621
4622	if accum == nil {
4623		return tab, ret
4624	}
4625
4626	sep := "\t"
4627	pfx := ""
4628	sfx := ""
4629	plg := ""
4630	elg := ""
4631	lst := ""
4632
4633	def := ""
4634
4635	reg := ""
4636	exp := ""
4637
4638	col := "\t"
4639	lin := "\n"
4640
4641	varname := ""
4642	isAccum := false
4643
4644	wrp := false
4645
4646	plain := true
4647	var currColor *color.Color
4648
4649	// handles color, e.g., -color "red,bold", reset to plain by -color "-" (undocumented)
4650	printInColor := func(str string) {
4651		if plain || currColor == nil {
4652			accum(str)
4653		} else {
4654			tx := currColor.SprintFunc()
4655			tmp := fmt.Sprintf("%s", tx(str))
4656			accum(tmp)
4657		}
4658	}
4659
4660	// process commands
4661	for _, op := range commands {
4662
4663		str := op.Value
4664
4665		switch op.Type {
4666		case ELEMENT, FIRST, LAST, ENCODE, DECODE, PLAIN, UPPER, LOWER, CHAIN, TITLE, ORDER, YEAR, DOI, TRANSLATE, REPLACE,
4667			TERMS, WORDS, PAIRS, REVERSE, LETTERS, CLAUSES, INDICES, MESHCODE, MATRIX, ACCENTED,
4668			NUM, LEN, SUM, MIN, MAX, INC, DEC, SUB, AVG, DEV, MED, MUL, DIV, MOD, BIN, BIT, ZEROBASED, ONEBASED, UCSCBASED,
4669			REVCOMP, NUCLEIC, FASTA, NCBI2NA, NCBI4NA, MOLWT, HGVS:
4670			txt, ok := processClause(curr, op.Stages, mask, tab, pfx, sfx, plg, sep, def, reg, exp, wrp, op.Type, index, level, variables, transform, histogram)
4671			if ok {
4672				plg = ""
4673				lst = elg
4674				tab = col
4675				ret = lin
4676				if plain {
4677					accum(txt)
4678				} else {
4679					printInColor(txt)
4680				}
4681			}
4682		case HISTOGRAM:
4683			txt, ok := processClause(curr, op.Stages, mask, "", "", "", "", "", "", "", "", wrp, op.Type, index, level, variables, transform, histogram)
4684			if ok {
4685				accum(txt)
4686			}
4687		case TAB:
4688			col = str
4689		case RET:
4690			lin = str
4691		case PFX:
4692			pfx = str
4693		case SFX:
4694			sfx = str
4695		case SEP:
4696			sep = str
4697		case LBL:
4698			lbl := str
4699			accum(tab)
4700			accum(plg)
4701			accum(pfx)
4702			if plain {
4703				accum(lbl)
4704			} else {
4705				printInColor(lbl)
4706			}
4707			accum(sfx)
4708			plg = ""
4709			lst = elg
4710			tab = col
4711			ret = lin
4712		case PFC:
4713			// preface clears previous tab and sets prefix in one command
4714			pfx = str
4715			fallthrough
4716		case CLR:
4717			// clear previous tab after the fact
4718			tab = ""
4719		case DEQ:
4720			// set queued tab after the fact
4721			tab = str
4722		case PLG:
4723			plg = str
4724		case ELG:
4725			elg = str
4726		case WRP:
4727			// shortcut to wrap elements in XML tags
4728			if str == "" || str == "-" {
4729				sep = "\t"
4730				pfx = ""
4731				sfx = ""
4732				plg = ""
4733				elg = ""
4734				wrp = false
4735				break
4736			}
4737			// -wrp with comma-separated arguments is deprecated, but supported for backward compatibility
4738			lft, rgt := eutils.SplitInTwoRight(str, ",")
4739			if lft != "" {
4740				plg = "<" + lft + ">"
4741				elg = "</" + lft + ">"
4742			}
4743			if rgt != "" && rgt != "-" {
4744				pfx = "<" + rgt + ">"
4745				sfx = "</" + rgt + ">"
4746				sep = "</" + rgt + "><" + rgt + ">"
4747			}
4748			wrp = true
4749		case ENC:
4750			// shortcut to mark unexpanded instances with XML tags
4751			plg = ""
4752			elg = ""
4753			if str != "" && str != "-" {
4754				items := strings.Split(str, "/")
4755				for i := 0; i < len(items); i++ {
4756					plg += "<" + items[i] + ">"
4757				}
4758				for i := len(items) - 1; i >= 0; i-- {
4759					elg += "</" + items[i] + ">"
4760				}
4761			}
4762		case RST:
4763			pfx = ""
4764			sfx = ""
4765			plg = ""
4766			elg = ""
4767			sep = "\t"
4768			def = ""
4769			wrp = false
4770		case DEF:
4771			def = str
4772		case REG:
4773			reg = str
4774		case EXP:
4775			exp = str
4776		case COLOR:
4777			currColor = color.New()
4778			if str == "-" || str == "reset" || str == "clear" {
4779				plain = true
4780				break
4781			}
4782			plain = false
4783			items := strings.Split(str, ",")
4784			for _, itm := range items {
4785				switch itm {
4786				case "red":
4787					currColor.Add(color.FgRed)
4788				case "grn", "green":
4789					currColor.Add(color.FgGreen)
4790				case "blu", "blue":
4791					currColor.Add(color.FgBlue)
4792				case "blk", "black":
4793					currColor.Add(color.FgBlack)
4794				case "bld", "bold":
4795					currColor.Add(color.Bold)
4796				case "ital", "italic", "italics":
4797					currColor.Add(color.Italic)
4798				case "blink", "flash":
4799					currColor.Add(color.BlinkSlow)
4800				default:
4801					fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized color argument '%s'\n", itm)
4802					os.Exit(1)
4803				}
4804			}
4805		case ACCUMULATOR:
4806			isAccum = true
4807			varname = str
4808		case VARIABLE:
4809			isAccum = false
4810			varname = str
4811		case VALUE:
4812			length := len(str)
4813			if length > 1 && str[0] == '(' && str[length-1] == ')' {
4814				// set variable from literal text inside parentheses, e.g., -COM "(, )"
4815				variables[varname] = str[1 : length-1]
4816				// -if "&VARIABLE" will succeed if set to blank with empty parentheses "()"
4817			} else if str == "" {
4818				// -if "&VARIABLE" will fail if initialized with empty string ""
4819				delete(variables, varname)
4820			} else {
4821				txt, ok := processClause(curr, op.Stages, mask, "", pfx, sfx, plg, sep, def, reg, exp, wrp, op.Type, index, level, variables, transform, histogram)
4822				if ok {
4823					plg = ""
4824					lst = elg
4825					if isAccum {
4826						if variables[varname] == "" {
4827							variables[varname] = txt
4828						} else {
4829							variables[varname] += sep + txt
4830						}
4831					} else {
4832						variables[varname] = txt
4833					}
4834				}
4835			}
4836			varname = ""
4837			isAccum = false
4838		default:
4839		}
4840	}
4841
4842	if plain {
4843		accum(lst)
4844	} else {
4845		printInColor(lst)
4846	}
4847
4848	return tab, ret
4849}
4850
// CONDITIONAL EXECUTION USES -if AND -unless STATEMENTS, WITH SUPPORT FOR DEPRECATED -match AND -avoid STATEMENTS
4852
4853// conditionsAreSatisfied tests a set of conditions to determine if extraction should proceed
4854func conditionsAreSatisfied(conditions []*Operation, curr *eutils.XMLNode, mask string, index, level int, variables map[string]string) bool {
4855
4856	if curr == nil {
4857		return false
4858	}
4859
4860	required := 0
4861	observed := 0
4862	forbidden := 0
4863	isMatch := false
4864	isAvoid := false
4865
4866	// matchFound tests individual conditions
4867	matchFound := func(stages []*Step) bool {
4868
4869		if stages == nil || len(stages) < 1 {
4870			return false
4871		}
4872
4873		stage := stages[0]
4874
4875		var constraint *Step
4876
4877		if len(stages) > 1 {
4878			constraint = stages[1]
4879		}
4880
4881		status := stage.Type
4882		prnt := stage.Parent
4883		match := stage.Match
4884		attrib := stage.Attrib
4885		typL := stage.TypL
4886		strL := stage.StrL
4887		intL := stage.IntL
4888		typR := stage.TypR
4889		strR := stage.StrR
4890		intR := stage.IntR
4891		norm := stage.Norm
4892		wildcard := stage.Wild
4893		unescape := true
4894
4895		found := false
4896		number := ""
4897
4898		// exploreElements is a wrapper for eutils.ExploreElements, obtaining most arguments as closures
4899		exploreElements := func(proc func(string, int)) {
4900			eutils.ExploreElements(curr, mask, prnt, match, attrib, wildcard, unescape, level, proc)
4901		}
4902
4903		// test string or numeric constraints
4904		testConstraint := func(str string) bool {
4905
4906			if str == "" || constraint == nil {
4907				return false
4908			}
4909
4910			val := constraint.Value
4911			stat := constraint.Type
4912
4913			switch stat {
4914			case EQUALS, CONTAINS, ISWITHIN, STARTSWITH, ENDSWITH, ISNOT, ISBEFORE, ISAFTER, MATCHES, RESEMBLES:
4915				// substring test on element values
4916				str = strings.ToUpper(str)
4917				val = strings.ToUpper(val)
4918
4919				switch stat {
4920				case EQUALS:
4921					if str == val {
4922						return true
4923					}
4924				case CONTAINS:
4925					if strings.Contains(str, val) {
4926						return true
4927					}
4928				case ISWITHIN:
4929					if strings.Contains(val, str) {
4930						return true
4931					}
4932				case STARTSWITH:
4933					if strings.HasPrefix(str, val) {
4934						return true
4935					}
4936				case ENDSWITH:
4937					if strings.HasSuffix(str, val) {
4938						return true
4939					}
4940				case ISNOT:
4941					if str != val {
4942						return true
4943					}
4944				case ISBEFORE:
4945					if str < val {
4946						return true
4947					}
4948				case ISAFTER:
4949					if str > val {
4950						return true
4951					}
4952				case MATCHES:
4953					if eutils.RemoveCommaOrSemicolon(str) == strings.ToLower(val) {
4954						return true
4955					}
4956				case RESEMBLES:
4957					if sortStringByWords(str) == strings.ToLower(val) {
4958						return true
4959					}
4960				default:
4961				}
4962			case ISEQUALTO, DIFFERSFROM:
4963				// conditional argument is element specifier
4964				if constraint.Parent != "" || constraint.Match != "" || constraint.Attrib != "" {
4965					ch := val[0]
4966					// pound, percent, and caret prefixes supported (undocumented)
4967					switch ch {
4968					case '#':
4969						count := 0
4970						eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) {
4971							count++
4972						})
4973						val = strconv.Itoa(count)
4974					case '%':
4975						length := 0
4976						eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) {
4977							if stn != "" {
4978								length += len(stn)
4979							}
4980						})
4981						val = strconv.Itoa(length)
4982					case '^':
4983						depth := 0
4984						eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) {
4985							depth = lvl
4986						})
4987						val = strconv.Itoa(depth)
4988					default:
4989						eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) {
4990							if stn != "" {
4991								val = stn
4992							}
4993						})
4994					}
4995				}
4996				str = strings.ToUpper(str)
4997				val = strings.ToUpper(val)
4998
4999				switch stat {
5000				case ISEQUALTO:
5001					if str == val {
5002						return true
5003					}
5004				case DIFFERSFROM:
5005					if str != val {
5006						return true
5007					}
5008				default:
5009				}
5010			case GT, GE, LT, LE, EQ, NE:
5011				// second argument of numeric test can be element specifier
5012				if constraint.Parent != "" || constraint.Match != "" || constraint.Attrib != "" {
5013					ch := val[0]
5014					// pound, percent, and caret prefixes supported as potentially useful for data QA (undocumented)
5015					switch ch {
5016					case '#':
5017						count := 0
5018						eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) {
5019							count++
5020						})
5021						val = strconv.Itoa(count)
5022					case '%':
5023						length := 0
5024						eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) {
5025							if stn != "" {
5026								length += len(stn)
5027							}
5028						})
5029						val = strconv.Itoa(length)
5030					case '^':
5031						depth := 0
5032						eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) {
5033							depth = lvl
5034						})
5035						val = strconv.Itoa(depth)
5036					default:
5037						eutils.ExploreElements(curr, mask, constraint.Parent, constraint.Match, constraint.Attrib, constraint.Wild, true, level, func(stn string, lvl int) {
5038							if stn != "" {
5039								_, errz := strconv.Atoi(stn)
5040								if errz == nil {
5041									val = stn
5042								}
5043							}
5044						})
5045					}
5046				}
5047
5048				// numeric tests on element values
5049				x, errx := strconv.Atoi(str)
5050				y, erry := strconv.Atoi(val)
5051
5052				// both arguments must resolve to integers
5053				if errx != nil || erry != nil {
5054					return false
5055				}
5056
5057				switch stat {
5058				case GT:
5059					if x > y {
5060						return true
5061					}
5062				case GE:
5063					if x >= y {
5064						return true
5065					}
5066				case LT:
5067					if x < y {
5068						return true
5069					}
5070				case LE:
5071					if x <= y {
5072						return true
5073					}
5074				case EQ:
5075					if x == y {
5076						return true
5077					}
5078				case NE:
5079					if x != y {
5080						return true
5081					}
5082				default:
5083				}
5084			default:
5085			}
5086
5087			return false
5088		}
5089
5090		// checkConstraint applies optional [min:max] range restriction and sends result to testConstraint
5091		checkConstraint := func(str string) bool {
5092
5093			// handle usual situation with no range first
5094			if norm {
5095				return testConstraint(str)
5096			}
5097
5098			// check for [after|before] variant
5099			if typL == STRINGRANGE || typR == STRINGRANGE {
5100				if strL != "" {
5101					// use case-insensitive test
5102					strL = strings.ToUpper(strL)
5103					idx := strings.Index(strings.ToUpper(str), strL)
5104					if idx < 0 {
5105						// specified substring must be present in original string
5106						return false
5107					}
5108					ln := len(strL)
5109					// remove leading text
5110					str = str[idx+ln:]
5111				}
5112				if strR != "" {
5113					strR = strings.ToUpper(strR)
5114					idx := strings.Index(strings.ToUpper(str), strR)
5115					if idx < 0 {
5116						// specified substring must be present in remaining string
5117						return false
5118					}
5119					// remove trailing text
5120					str = str[:idx]
5121				}
5122				if str != "" {
5123					return testConstraint(str)
5124				}
5125				return false
5126			}
5127
5128			min := 0
5129			max := 0
5130
5131			// slice arguments use variable value +- adjustment or integer constant
5132			if typL == VARIABLERANGE {
5133				if strL == "" {
5134					return false
5135				}
5136				lft, ok := variables[strL]
5137				if !ok {
5138					return false
5139				}
5140				val, err := strconv.Atoi(lft)
5141				if err != nil {
5142					return false
5143				}
5144				// range argument values are inclusive and 1-based, decrement variable start +- offset to use in slice
5145				min = val + intL - 1
5146			} else if typL == INTEGERRANGE {
5147				// range argument values are inclusive and 1-based, decrement literal start to use in slice
5148				min = intL - 1
5149			}
5150			if typR == VARIABLERANGE {
5151				if strR == "" {
5152					return false
5153				}
5154				rgt, ok := variables[strR]
5155				if !ok {
5156					return false
5157				}
5158				val, err := strconv.Atoi(rgt)
5159				if err != nil {
5160					return false
5161				}
5162				if val+intR < 0 {
5163					// negative value is 1-based inset from end of string (undocumented)
5164					max = len(str) + val + intR + 1
5165				} else {
5166					max = val + intR
5167				}
5168			} else if typR == INTEGERRANGE {
5169				if intR < 0 {
5170					// negative max is inset from end of string (undocumented)
5171					max = len(str) + intR + 1
5172				} else {
5173					max = intR
5174				}
5175			}
5176
5177			// numeric range now calculated, apply slice to string
5178			if min == 0 && max == 0 {
5179				return testConstraint(str)
5180			} else if max == 0 {
5181				if min > 0 && min < len(str) {
5182					str = str[min:]
5183					if str != "" {
5184						return testConstraint(str)
5185					}
5186				}
5187			} else if min == 0 {
5188				if max > 0 && max <= len(str) {
5189					str = str[:max]
5190					if str != "" {
5191						return testConstraint(str)
5192					}
5193				}
5194			} else {
5195				if min < max && min > 0 && max <= len(str) {
5196					str = str[min:max]
5197					if str != "" {
5198						return testConstraint(str)
5199					}
5200				}
5201			}
5202
5203			return false
5204		}
5205
5206		switch status {
5207		case ELEMENT:
5208			exploreElements(func(str string, lvl int) {
5209				// match to XML container object sends empty string, so do not check for str != "" here
5210				// test every selected element individually if value is specified
5211				if constraint == nil || checkConstraint(str) {
5212					found = true
5213				}
5214			})
5215		case VARIABLE:
5216			// use value of stored variable
5217			str, ok := variables[match]
5218			if ok {
5219				//  -if &VARIABLE -equals VALUE is the supported construct
5220				if constraint == nil || checkConstraint(str) {
5221					found = true
5222				}
5223			}
5224		case COUNT:
5225			count := 0
5226
5227			exploreElements(func(str string, lvl int) {
5228				count++
5229				found = true
5230			})
5231
5232			// number of element objects
5233			number = strconv.Itoa(count)
5234		case LENGTH:
5235			length := 0
5236
5237			exploreElements(func(str string, lvl int) {
5238				length += len(str)
5239				found = true
5240			})
5241
5242			// length of element strings
5243			number = strconv.Itoa(length)
5244		case DEPTH:
5245			depth := 0
5246
5247			exploreElements(func(str string, lvl int) {
5248				depth = lvl
5249				found = true
5250			})
5251
5252			// depth of last element in scope
5253			number = strconv.Itoa(depth)
5254		case INDEX:
5255			// index of explored parent object
5256			number = strconv.Itoa(index)
5257			found = true
5258		default:
5259		}
5260
5261		if number == "" {
5262			return found
5263		}
5264
5265		if constraint == nil || checkConstraint(number) {
5266			return true
5267		}
5268
5269		return false
5270	}
5271
5272	// test conditional arguments
5273	for _, op := range conditions {
5274
5275		switch op.Type {
5276		// -if tests for presence of element (deprecated -match can test element:value)
5277		case SELECT, IF, MATCH:
5278			// checking for failure here allows for multiple -if [ -and / -or ] clauses
5279			if isMatch && observed < required {
5280				return false
5281			}
5282			if isAvoid && forbidden > 0 {
5283				return false
5284			}
5285			required = 0
5286			observed = 0
5287			forbidden = 0
5288			isMatch = true
5289			isAvoid = false
5290			// continue on to next two cases
5291			fallthrough
5292		case AND:
5293			required++
5294			// continue on to next case
5295			fallthrough
5296		case OR:
5297			if matchFound(op.Stages) {
5298				observed++
5299				// record presence of forbidden element if in -unless clause
5300				forbidden++
5301			}
5302		// -unless tests for absence of element, or presence but with failure of subsequent value test (deprecated -avoid can test element:value)
5303		case UNLESS, AVOID:
5304			if isMatch && observed < required {
5305				return false
5306			}
5307			if isAvoid && forbidden > 0 {
5308				return false
5309			}
5310			required = 0
5311			observed = 0
5312			forbidden = 0
5313			isMatch = false
5314			isAvoid = true
5315			if matchFound(op.Stages) {
5316				forbidden++
5317			}
5318		default:
5319		}
5320	}
5321
5322	if isMatch && observed < required {
5323		return false
5324	}
5325	if isAvoid && forbidden > 0 {
5326		return false
5327	}
5328
5329	return true
5330}
5331
5332// RECURSIVELY PROCESS EXPLORATION COMMANDS AND XML DATA STRUCTURE
5333
5334// processCommands visits XML nodes, performs conditional tests, and executes data extraction instructions
5335func processCommands(cmds *Block, curr *eutils.XMLNode, tab, ret string, index, level int, variables map[string]string, transform map[string]string, histogram map[string]int, accum func(string)) (string, string) {
5336
5337	if accum == nil {
5338		return tab, ret
5339	}
5340
5341	prnt := cmds.Parent
5342	match := cmds.Match
5343
5344	// leading colon indicates namespace prefix wildcard
5345	wildcard := false
5346	if strings.HasPrefix(prnt, ":") || strings.HasPrefix(match, ":") {
5347		wildcard = true
5348	}
5349
5350	// **/Object performs deep exploration of recursive data
5351	deep := false
5352	if prnt == "**" {
5353		prnt = "*"
5354		deep = true
5355	}
5356	// Object/** performs exhaustive exploration of nodes
5357	tall := false
5358	if match == "**" {
5359		match = "*"
5360		tall = true
5361	}
5362
5363	// closure passes local variables to callback, which can modify caller tab and ret values
5364	processNode := func(node *eutils.XMLNode, idx, lvl int) {
5365
5366		// apply -if or -unless tests
5367		if conditionsAreSatisfied(cmds.Conditions, node, match, idx, lvl, variables) {
5368
5369			// execute data extraction commands
5370			if len(cmds.Commands) > 0 {
5371				tab, ret = processInstructions(cmds.Commands, node, match, tab, ret, idx, lvl, variables, transform, histogram, accum)
5372			}
5373
5374			// process sub commands on child node
5375			for _, sub := range cmds.Subtasks {
5376				tab, ret = processCommands(sub, node, tab, ret, 1, lvl, variables, transform, histogram, accum)
5377			}
5378
5379		} else {
5380
5381			// execute commands after -else statement
5382			if len(cmds.Failure) > 0 {
5383				tab, ret = processInstructions(cmds.Failure, node, match, tab, ret, idx, lvl, variables, transform, histogram, accum)
5384			}
5385		}
5386	}
5387
5388	// exploreNodes recursive definition
5389	var exploreNodes func(*eutils.XMLNode, int, int, bool, func(*eutils.XMLNode, int, int)) int
5390
5391	// exploreNodes visits all nodes that match the selection criteria
5392	exploreNodes = func(curr *eutils.XMLNode, indx, levl int, force bool, proc func(*eutils.XMLNode, int, int)) int {
5393
5394		if curr == nil || proc == nil {
5395			return indx
5396		}
5397
5398		// match is "*" for heterogeneous data constructs, e.g., -group PubmedArticleSet/*
5399		// wildcard matches any namespace prefix
5400		if curr.Name == match ||
5401			match == "*" ||
5402			(wildcard && strings.HasPrefix(match, ":") && strings.HasSuffix(curr.Name, match)) {
5403
5404			if prnt == "" ||
5405				curr.Parent == prnt ||
5406				force ||
5407				(wildcard && strings.HasPrefix(prnt, ":") && strings.HasSuffix(curr.Parent, prnt)) {
5408
5409				proc(curr, indx, levl)
5410				indx++
5411
5412				if tall && prnt != "" {
5413					// exhaustive exploration of child nodes within region of parent match
5414					for chld := curr.Children; chld != nil; chld = chld.Next {
5415						indx = exploreNodes(chld, indx, levl+1, true, proc)
5416					}
5417				}
5418
5419				if !deep {
5420					// do not explore within recursive object
5421					return indx
5422				}
5423			}
5424		}
5425
5426		// clearing prnt "*" now allows nested exploration within recursive data, e.g., -pattern Taxon -block */Taxon
5427		if prnt == "*" {
5428			prnt = ""
5429		}
5430
5431		// explore child nodes
5432		for chld := curr.Children; chld != nil; chld = chld.Next {
5433			indx = exploreNodes(chld, indx, levl+1, false, proc)
5434		}
5435
5436		return indx
5437	}
5438
5439	// explorePath recursive definition
5440	var explorePath func(*eutils.XMLNode, []string, int, int, func(*eutils.XMLNode, int, int)) int
5441
5442	// explorePath visits child nodes and matches against next entry in path
5443	explorePath = func(curr *eutils.XMLNode, path []string, indx, levl int, proc func(*eutils.XMLNode, int, int)) int {
5444
5445		if curr == nil || proc == nil {
5446			return indx
5447		}
5448
5449		if len(path) < 1 {
5450			proc(curr, indx, levl)
5451			indx++
5452			return indx
5453		}
5454
5455		name := path[0]
5456		rest := path[1:]
5457
5458		// explore next level of child nodes
5459		for chld := curr.Children; chld != nil; chld = chld.Next {
5460			if chld.Name == name {
5461				// recurse only if child matches next component in path
5462				indx = explorePath(chld, rest, indx, levl+1, proc)
5463			}
5464		}
5465
5466		return indx
5467	}
5468
5469	if cmds.Foreword != "" {
5470		accum(cmds.Foreword)
5471	}
5472
5473	// apply -position test
5474
5475	if cmds.Position == "" || cmds.Position == "all" {
5476
5477		exploreNodes(curr, index, level, false, processNode)
5478
5479	} else if cmds.Position == "path" {
5480
5481		exploreNodes(curr, index, level, false,
5482			func(node *eutils.XMLNode, idx, lvl int) {
5483				// exploreNodes callback has matched first path component, now explore remainder one level and component at a time
5484				explorePath(node, cmds.Path, idx, lvl, processNode)
5485			})
5486
5487	} else {
5488
5489		var single *eutils.XMLNode
5490		lev := 0
5491		ind := 0
5492
5493		if cmds.Position == "first" {
5494
5495			exploreNodes(curr, index, level, false,
5496				func(node *eutils.XMLNode, idx, lvl int) {
5497					if single == nil {
5498						single = node
5499						ind = idx
5500						lev = lvl
5501					}
5502				})
5503
5504		} else if cmds.Position == "last" {
5505
5506			exploreNodes(curr, index, level, false,
5507				func(node *eutils.XMLNode, idx, lvl int) {
5508					single = node
5509					ind = idx
5510					lev = lvl
5511				})
5512
5513		} else if cmds.Position == "outer" {
5514
5515			// print only first and last nodes
5516			var beg *Limiter
5517			var end *Limiter
5518
5519			exploreNodes(curr, index, level, false,
5520				func(node *eutils.XMLNode, idx, lvl int) {
5521					if beg == nil {
5522						beg = &Limiter{node, idx, lvl}
5523					} else {
5524						end = &Limiter{node, idx, lvl}
5525					}
5526				})
5527
5528			if beg != nil {
5529				processNode(beg.Obj, beg.Idx, beg.Lvl)
5530			}
5531			if end != nil {
5532				processNode(end.Obj, end.Idx, end.Lvl)
5533			}
5534
5535		} else if cmds.Position == "inner" {
5536
5537			// print all but first and last nodes
5538			var prev *Limiter
5539			var next *Limiter
5540			first := true
5541
5542			exploreNodes(curr, index, level, false,
5543				func(node *eutils.XMLNode, idx, lvl int) {
5544					if first {
5545						first = false
5546						return
5547					}
5548
5549					prev = next
5550					next = &Limiter{node, idx, lvl}
5551
5552					if prev != nil {
5553						processNode(prev.Obj, prev.Idx, prev.Lvl)
5554					}
5555				})
5556
5557		} else if cmds.Position == "even" {
5558
5559			okay := false
5560
5561			exploreNodes(curr, index, level, false,
5562				func(node *eutils.XMLNode, idx, lvl int) {
5563					if okay {
5564						processNode(node, idx, lvl)
5565					}
5566					okay = !okay
5567				})
5568
5569		} else if cmds.Position == "odd" {
5570
5571			okay := true
5572
5573			exploreNodes(curr, index, level, false,
5574				func(node *eutils.XMLNode, idx, lvl int) {
5575					if okay {
5576						processNode(node, idx, lvl)
5577					}
5578					okay = !okay
5579				})
5580
5581		} else {
5582
5583			// use numeric position
5584			number, err := strconv.Atoi(cmds.Position)
5585			if err == nil {
5586
5587				pos := 0
5588
5589				exploreNodes(curr, index, level, false,
5590					func(node *eutils.XMLNode, idx, lvl int) {
5591						pos++
5592						if pos == number {
5593							single = node
5594							ind = idx
5595							lev = lvl
5596						}
5597					})
5598
5599			} else {
5600
5601				fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized position '%s'\n", cmds.Position)
5602				os.Exit(1)
5603			}
5604		}
5605
5606		if single != nil {
5607			processNode(single, ind, lev)
5608		}
5609	}
5610
5611	if cmds.Afterword != "" {
5612		accum(cmds.Afterword)
5613	}
5614
5615	return tab, ret
5616}
5617
5618// PROCESS ONE XML COMPONENT RECORD
5619
5620// processQuery perform data extraction driven by command-line arguments
5621func processQuery(text, parent string, index int, hd, tl string, transform map[string]string, histogram map[string]int, cmds *Block) string {
5622
5623	if text == "" || cmds == nil {
5624		return ""
5625	}
5626
5627	// exit from function will collect garbage of node structure for current XML object
5628	pat := eutils.ParseRecord(text, parent)
5629
5630	if pat == nil {
5631		return ""
5632	}
5633
5634	// exit from function will also free map of recorded variables for current -pattern
5635	variables := make(map[string]string)
5636
5637	var buffer strings.Builder
5638
5639	ok := false
5640
5641	if hd != "" {
5642		buffer.WriteString(hd)
5643	}
5644
5645	ret := ""
5646
5647	if cmds.Position == "select" {
5648
5649		if conditionsAreSatisfied(cmds.Conditions, pat, cmds.Match, index, 1, variables) {
5650			ok = true
5651			buffer.WriteString(text)
5652			ret = "\n"
5653		}
5654
5655	} else {
5656
5657		// start processing at top of command tree and top of XML subregion selected by -pattern
5658		_, ret = processCommands(cmds, pat, "", "", index, 1, variables, transform, histogram,
5659			func(str string) {
5660				if str != "" {
5661					ok = true
5662					buffer.WriteString(str)
5663				}
5664			})
5665	}
5666
5667	if tl != "" {
5668		buffer.WriteString(tl)
5669	}
5670
5671	if ret != "" {
5672		ok = true
5673		buffer.WriteString(ret)
5674	}
5675
5676	txt := buffer.String()
5677
5678	// remove leading newline (-insd -pfx artifact)
5679	if txt != "" && txt[0] == '\n' {
5680		txt = txt[1:]
5681	}
5682
5683	if !ok {
5684		return ""
5685	}
5686
5687	// return consolidated result string
5688	return txt
5689}
5690
5691// INSDSEQ EXTRACTION COMMAND GENERATOR
5692
5693// e.g., xtract -insd complete mat_peptide "%peptide" product peptide
5694
5695// processINSD generates extraction commands for GenBank/RefSeq records in INSDSet format
5696func processINSD(args []string, isPipe, addDash, doIndex bool) []string {
5697
5698	// legal GenBank / GenPept / RefSeq features
5699
5700	features := []string{
5701		"-10_signal",
5702		"-35_signal",
5703		"3'clip",
5704		"3'UTR",
5705		"5'clip",
5706		"5'UTR",
5707		"allele",
5708		"assembly_gap",
5709		"attenuator",
5710		"Bond",
5711		"C_region",
5712		"CAAT_signal",
5713		"CDS",
5714		"centromere",
5715		"conflict",
5716		"D_segment",
5717		"D-loop",
5718		"enhancer",
5719		"exon",
5720		"gap",
5721		"GC_signal",
5722		"gene",
5723		"iDNA",
5724		"intron",
5725		"J_segment",
5726		"LTR",
5727		"mat_peptide",
5728		"misc_binding",
5729		"misc_difference",
5730		"misc_feature",
5731		"misc_recomb",
5732		"misc_RNA",
5733		"misc_signal",
5734		"misc_structure",
5735		"mobile_element",
5736		"modified_base",
5737		"mRNA",
5738		"mutation",
5739		"N_region",
5740		"ncRNA",
5741		"old_sequence",
5742		"operon",
5743		"oriT",
5744		"polyA_signal",
5745		"polyA_site",
5746		"precursor_RNA",
5747		"prim_transcript",
5748		"primer_bind",
5749		"promoter",
5750		"propeptide",
5751		"protein_bind",
5752		"Protein",
5753		"RBS",
5754		"Region",
5755		"regulatory",
5756		"rep_origin",
5757		"repeat_region",
5758		"repeat_unit",
5759		"rRNA",
5760		"S_region",
5761		"satellite",
5762		"scRNA",
5763		"sig_peptide",
5764		"Site",
5765		"snoRNA",
5766		"snRNA",
5767		"source",
5768		"stem_loop",
5769		"STS",
5770		"TATA_signal",
5771		"telomere",
5772		"terminator",
5773		"tmRNA",
5774		"transit_peptide",
5775		"tRNA",
5776		"unsure",
5777		"V_region",
5778		"V_segment",
5779		"variation",
5780	}
5781
5782	// legal GenBank / GenPept / RefSeq qualifiers
5783
5784	qualifiers := []string{
5785		"allele",
5786		"altitude",
5787		"anticodon",
5788		"artificial_location",
5789		"bio_material",
5790		"bond_type",
5791		"bound_moiety",
5792		"breed",
5793		"calculated_mol_wt",
5794		"cell_line",
5795		"cell_type",
5796		"chloroplast",
5797		"chromoplast",
5798		"chromosome",
5799		"circular_RNA",
5800		"citation",
5801		"clone_lib",
5802		"clone",
5803		"coded_by",
5804		"codon_start",
5805		"codon",
5806		"collected_by",
5807		"collection_date",
5808		"compare",
5809		"cons_splice",
5810		"country",
5811		"cultivar",
5812		"culture_collection",
5813		"cyanelle",
5814		"db_xref",
5815		"derived_from",
5816		"dev_stage",
5817		"direction",
5818		"EC_number",
5819		"ecotype",
5820		"encodes",
5821		"endogenous_virus",
5822		"environmental_sample",
5823		"estimated_length",
5824		"evidence",
5825		"exception",
5826		"experiment",
5827		"focus",
5828		"frequency",
5829		"function",
5830		"gap_type",
5831		"gdb_xref",
5832		"gene_synonym",
5833		"gene",
5834		"germline",
5835		"haplogroup",
5836		"haplotype",
5837		"host",
5838		"identified_by",
5839		"inference",
5840		"insertion_seq",
5841		"isolate",
5842		"isolation_source",
5843		"kinetoplast",
5844		"lab_host",
5845		"label",
5846		"lat_lon",
5847		"linkage_evidence",
5848		"locus_tag",
5849		"macronuclear",
5850		"map",
5851		"mating_type",
5852		"metagenome_source",
5853		"metagenomic",
5854		"mitochondrion",
5855		"mobile_element_type",
5856		"mobile_element",
5857		"mod_base",
5858		"mol_type",
5859		"name",
5860		"nat_host",
5861		"ncRNA_class",
5862		"non_functional",
5863		"note",
5864		"number",
5865		"old_locus_tag",
5866		"operon",
5867		"organelle",
5868		"organism",
5869		"partial",
5870		"PCR_conditions",
5871		"PCR_primers",
5872		"peptide",
5873		"phenotype",
5874		"plasmid",
5875		"pop_variant",
5876		"product",
5877		"protein_id",
5878		"proviral",
5879		"pseudo",
5880		"pseudogene",
5881		"rearranged",
5882		"recombination_class",
5883		"region_name",
5884		"regulatory_class",
5885		"replace",
5886		"ribosomal_slippage",
5887		"rpt_family",
5888		"rpt_type",
5889		"rpt_unit_range",
5890		"rpt_unit_seq",
5891		"rpt_unit",
5892		"satellite",
5893		"segment",
5894		"sequenced_mol",
5895		"serotype",
5896		"serovar",
5897		"sex",
5898		"site_type",
5899		"specific_host",
5900		"specimen_voucher",
5901		"standard_name",
5902		"strain",
5903		"structural_class",
5904		"sub_clone",
5905		"sub_species",
5906		"sub_strain",
5907		"submitter_seqid",
5908		"tag_peptide",
5909		"tissue_lib",
5910		"tissue_type",
5911		"trans_splicing",
5912		"transcript_id",
5913		"transcription",
5914		"transgenic",
5915		"transl_except",
5916		"transl_table",
5917		"translation",
5918		"transposon",
5919		"type_material",
5920		"UniProtKB_evidence",
5921		"usedin",
5922		"variety",
5923		"virion",
5924	}
5925
5926	// legal INSDSeq XML fields
5927
5928	insdtags := []string{
5929		"INSDAltSeqData_items",
5930		"INSDAltSeqData",
5931		"INSDAltSeqItem_first-accn",
5932		"INSDAltSeqItem_gap-comment",
5933		"INSDAltSeqItem_gap-length",
5934		"INSDAltSeqItem_gap-linkage",
5935		"INSDAltSeqItem_gap-type",
5936		"INSDAltSeqItem_interval",
5937		"INSDAltSeqItem_isgap",
5938		"INSDAltSeqItem_isgap@value",
5939		"INSDAltSeqItem_last-accn",
5940		"INSDAltSeqItem_value",
5941		"INSDAltSeqItem",
5942		"INSDAuthor",
5943		"INSDComment_paragraphs",
5944		"INSDComment_type",
5945		"INSDComment",
5946		"INSDCommentParagraph",
5947		"INSDFeature_intervals",
5948		"INSDFeature_key",
5949		"INSDFeature_location",
5950		"INSDFeature_operator",
5951		"INSDFeature_partial3",
5952		"INSDFeature_partial3@value",
5953		"INSDFeature_partial5",
5954		"INSDFeature_partial5@value",
5955		"INSDFeature_quals",
5956		"INSDFeature_xrefs",
5957		"INSDFeature",
5958		"INSDFeatureSet_annot-source",
5959		"INSDFeatureSet_features",
5960		"INSDFeatureSet",
5961		"INSDInterval_accession",
5962		"INSDInterval_from",
5963		"INSDInterval_interbp",
5964		"INSDInterval_interbp@value",
5965		"INSDInterval_iscomp",
5966		"INSDInterval_iscomp@value",
5967		"INSDInterval_point",
5968		"INSDInterval_to",
5969		"INSDInterval",
5970		"INSDKeyword",
5971		"INSDQualifier_name",
5972		"INSDQualifier_value",
5973		"INSDQualifier",
5974		"INSDReference_authors",
5975		"INSDReference_consortium",
5976		"INSDReference_journal",
5977		"INSDReference_position",
5978		"INSDReference_pubmed",
5979		"INSDReference_reference",
5980		"INSDReference_remark",
5981		"INSDReference_title",
5982		"INSDReference_xref",
5983		"INSDReference",
5984		"INSDSecondary-accn",
5985		"INSDSeq_accession-version",
5986		"INSDSeq_alt-seq",
5987		"INSDSeq_comment-set",
5988		"INSDSeq_comment",
5989		"INSDSeq_contig",
5990		"INSDSeq_create-date",
5991		"INSDSeq_create-release",
5992		"INSDSeq_database-reference",
5993		"INSDSeq_definition",
5994		"INSDSeq_division",
5995		"INSDSeq_entry-version",
5996		"INSDSeq_feature-set",
5997		"INSDSeq_feature-table",
5998		"INSDSeq_keywords",
5999		"INSDSeq_length",
6000		"INSDSeq_locus",
6001		"INSDSeq_moltype",
6002		"INSDSeq_organism",
6003		"INSDSeq_other-seqids",
6004		"INSDSeq_primary-accession",
6005		"INSDSeq_primary",
6006		"INSDSeq_project",
6007		"INSDSeq_references",
6008		"INSDSeq_secondary-accessions",
6009		"INSDSeq_segment",
6010		"INSDSeq_sequence",
6011		"INSDSeq_source-db",
6012		"INSDSeq_source",
6013		"INSDSeq_strandedness",
6014		"INSDSeq_struc-comments",
6015		"INSDSeq_taxonomy",
6016		"INSDSeq_topology",
6017		"INSDSeq_update-date",
6018		"INSDSeq_update-release",
6019		"INSDSeq_xrefs",
6020		"INSDSeq",
6021		"INSDSeqid",
6022		"INSDSet",
6023		"INSDStrucComment_items",
6024		"INSDStrucComment_name",
6025		"INSDStrucComment",
6026		"INSDStrucCommentItem_tag",
6027		"INSDStrucCommentItem_url",
6028		"INSDStrucCommentItem_value",
6029		"INSDStrucCommentItem",
6030		"INSDXref_dbname",
6031		"INSDXref_id",
6032		"INSDXref",
6033	}
6034
6035	checkAgainstVocabulary := func(str, objtype string, arry []string) {
6036
6037		if str == "" || arry == nil {
6038			return
6039		}
6040
6041		// skip past pound, percent, or caret character at beginning of string
6042		if len(str) > 1 {
6043			switch str[0] {
6044			case '#', '%', '^':
6045				str = str[1:]
6046			default:
6047			}
6048		}
6049
6050		for _, txt := range arry {
6051			if str == txt {
6052				return
6053			}
6054			if strings.ToUpper(str) == strings.ToUpper(txt) {
6055				fmt.Fprintf(os.Stderr, "\nERROR: Incorrect capitalization of '%s' %s, change to '%s'\n", str, objtype, txt)
6056				os.Exit(1)
6057			}
6058		}
6059
6060		fmt.Fprintf(os.Stderr, "\nERROR: Item '%s' is not a legal -insd %s\n", str, objtype)
6061		os.Exit(1)
6062	}
6063
6064	var acc []string
6065
6066	max := len(args)
6067	if max < 1 {
6068		fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to xtract -insd\n")
6069		os.Exit(1)
6070	}
6071
6072	if doIndex {
6073		if isPipe {
6074			acc = append(acc, "-head", "<IdxDocumentSet>", "-tail", "</IdxDocumentSet>")
6075			acc = append(acc, "-hd", "  <IdxDocument>\n", "-tl", "  </IdxDocument>")
6076			acc = append(acc, "-pattern", "INSDSeq", "-pfx", "    <IdxUid>", "-sfx", "</IdxUid>\n")
6077			acc = append(acc, "-element", "INSDSeq_accession-version", "-clr", "-rst", "-tab", "\n")
6078		} else {
6079			acc = append(acc, "-head", "\"<IdxDocumentSet>\"", "-tail", "\"</IdxDocumentSet>\"")
6080			acc = append(acc, "-hd", "\"  <IdxDocument>\\n\"", "-tl", "\"  </IdxDocument>\"")
6081			acc = append(acc, "-pattern", "INSDSeq", "-pfx", "\"    <IdxUid>\"", "-sfx", "\"</IdxUid>\\n\"")
6082			acc = append(acc, "-element", "INSDSeq_accession-version", "-clr", "-rst", "-tab", "\\n")
6083		}
6084	} else {
6085		acc = append(acc, "-pattern", "INSDSeq", "-ACCN", "INSDSeq_accession-version", "-SEQ", "INSDSeq_sequence")
6086	}
6087
6088	if doIndex {
6089		if isPipe {
6090			acc = append(acc, "-group", "INSDSeq", "-lbl", "    <IdxSearchFields>\n")
6091		} else {
6092			acc = append(acc, "-group", "INSDSeq", "-lbl", "\"    <IdxSearchFields>\\n\"")
6093		}
6094	}
6095
6096	printAccn := true
6097
6098	// collect descriptors
6099
6100	if strings.HasPrefix(args[0], "INSD") {
6101
6102		if doIndex {
6103			acc = append(acc, "-clr", "-indices")
6104		} else {
6105			if isPipe {
6106				acc = append(acc, "-clr", "-pfx", "\\n", "-element", "&ACCN")
6107				acc = append(acc, "-group", "INSDSeq", "-sep", "|", "-element")
6108			} else {
6109				acc = append(acc, "-clr", "-pfx", "\"\\n\"", "-element", "\"&ACCN\"")
6110				acc = append(acc, "-group", "INSDSeq", "-sep", "\"|\"", "-element")
6111			}
6112			printAccn = false
6113		}
6114
6115		for {
6116			if len(args) < 1 {
6117				return acc
6118			}
6119			str := args[0]
6120			if !strings.HasPrefix(args[0], "INSD") {
6121				break
6122			}
6123			checkAgainstVocabulary(str, "element", insdtags)
6124			acc = append(acc, str)
6125			args = args[1:]
6126		}
6127
6128	} else if strings.HasPrefix(strings.ToUpper(args[0]), "INSD") {
6129
6130		// report capitalization or vocabulary failure
6131		checkAgainstVocabulary(args[0], "element", insdtags)
6132
6133		// program should not get to this point, but warn and exit anyway
6134		fmt.Fprintf(os.Stderr, "\nERROR: Item '%s' is not a legal -insd %s\n", args[0], "element")
6135		os.Exit(1)
6136	}
6137
6138	// collect qualifiers
6139
6140	partial := false
6141	complete := false
6142
6143	if args[0] == "+" || args[0] == "complete" {
6144		complete = true
6145		args = args[1:]
6146		max--
6147	} else if args[0] == "-" || args[0] == "partial" {
6148		partial = true
6149		args = args[1:]
6150		max--
6151	}
6152
6153	if max < 1 {
6154		fmt.Fprintf(os.Stderr, "\nERROR: No feature key supplied to xtract -insd\n")
6155		os.Exit(1)
6156	}
6157
6158	acc = append(acc, "-group", "INSDFeature")
6159
6160	// limit to designated features
6161
6162	feature := args[0]
6163
6164	fcmd := "-if"
6165
6166	// can specify multiple features separated by plus sign (e.g., CDS+mRNA) or comma (e.g., CDS,mRNA)
6167	plus := strings.Split(feature, "+")
6168	for _, pls := range plus {
6169		comma := strings.Split(pls, ",")
6170		for _, cma := range comma {
6171
6172			checkAgainstVocabulary(cma, "feature", features)
6173			acc = append(acc, fcmd, "INSDFeature_key", "-equals", cma)
6174
6175			fcmd = "-or"
6176		}
6177	}
6178
6179	if max < 2 {
6180		// still need at least one qualifier even on legal feature
6181		fmt.Fprintf(os.Stderr, "\nERROR: Feature '%s' must be followed by at least one qualifier\n", feature)
6182		os.Exit(1)
6183	}
6184
6185	args = args[1:]
6186
6187	if complete {
6188		acc = append(acc, "-unless", "INSDFeature_partial5", "-or", "INSDFeature_partial3")
6189	} else if partial {
6190		acc = append(acc, "-if", "INSDFeature_partial5", "-or", "INSDFeature_partial3")
6191	}
6192
6193	if printAccn {
6194		if doIndex {
6195		} else {
6196			if isPipe {
6197				acc = append(acc, "-clr", "-pfx", "\\n", "-element", "&ACCN")
6198			} else {
6199				acc = append(acc, "-clr", "-pfx", "\"\\n\"", "-element", "\"&ACCN\"")
6200			}
6201		}
6202	}
6203
6204	for _, str := range args {
6205
6206		if str == "mol_wt" {
6207			str = "calculated_mol_wt"
6208		}
6209
6210		if strings.HasPrefix(str, "INSD") {
6211
6212			checkAgainstVocabulary(str, "element", insdtags)
6213			if doIndex {
6214				acc = append(acc, "-block", "INSDFeature", "-clr", "-indices")
6215			} else {
6216				if isPipe {
6217					acc = append(acc, "-block", "INSDFeature", "-sep", "|", "-element")
6218				} else {
6219					acc = append(acc, "-block", "INSDFeature", "-sep", "\"|\"", "-element")
6220				}
6221			}
6222			acc = append(acc, str)
6223			if addDash {
6224				acc = append(acc, "-block", "INSDFeature", "-unless", str)
6225				if strings.HasSuffix(str, "@value") {
6226					if isPipe {
6227						acc = append(acc, "-lbl", "false")
6228					} else {
6229						acc = append(acc, "-lbl", "\"false\"")
6230					}
6231				} else {
6232					if isPipe {
6233						acc = append(acc, "-lbl", "\\-")
6234					} else {
6235						acc = append(acc, "-lbl", "\"\\-\"")
6236					}
6237				}
6238			}
6239
6240		} else if strings.HasPrefix(str, "#INSD") {
6241
6242			checkAgainstVocabulary(str, "element", insdtags)
6243			if doIndex {
6244				acc = append(acc, "-block", "INSDFeature", "-clr", "-indices")
6245			} else {
6246				if isPipe {
6247					acc = append(acc, "-block", "INSDFeature", "-sep", "|", "-element")
6248					acc = append(acc, str)
6249				} else {
6250					acc = append(acc, "-block", "INSDFeature", "-sep", "\"|\"", "-element")
6251					ql := fmt.Sprintf("\"%s\"", str)
6252					acc = append(acc, ql)
6253				}
6254			}
6255
6256		} else if strings.HasPrefix(strings.ToUpper(str), "#INSD") {
6257
6258			// report capitalization or vocabulary failure
6259			checkAgainstVocabulary(str, "element", insdtags)
6260
6261		} else if str == "sub_sequence" {
6262
6263			// special sub_sequence qualifier shows sequence under feature intervals
6264			acc = append(acc, "-block", "INSDFeature_intervals")
6265
6266			acc = append(acc, "-subset", "INSDInterval", "-FR", "INSDInterval_from", "-TO", "INSDInterval_to")
6267			if isPipe {
6268				acc = append(acc, "-pfx", "", "-tab", "", "-nucleic", "&SEQ[&FR:&TO]")
6269			} else {
6270				acc = append(acc, "-pfx", "\"\"", "-tab", "\"\"", "-nucleic", "\"&SEQ[&FR:&TO]\"")
6271			}
6272
6273			acc = append(acc, "-subset", "INSDFeature_intervals")
6274			if isPipe {
6275				acc = append(acc, "-deq", "\\t")
6276			} else {
6277				acc = append(acc, "-deq", "\"\\t\"")
6278			}
6279
6280		} else if str == "feat_location" {
6281
6282			// special feat_location qualifier shows feature intervals
6283			acc = append(acc, "-block", "INSDFeature_intervals")
6284
6285			acc = append(acc, "-subset", "INSDInterval", "-FR", "INSDInterval_from", "-TO", "INSDInterval_to")
6286			if isPipe {
6287				acc = append(acc, "-pfx", "", "-tab", "..", "-element", "&FR")
6288				acc = append(acc, "-pfx", "", "-tab", ",", "-element", "&TO")
6289			} else {
6290				acc = append(acc, "-pfx", "\"\"", "-tab", "\"..\"", "-element", "\"&FR\"")
6291				acc = append(acc, "-pfx", "\"\"", "-tab", "\",\"", "-element", "\"&TO\"")
6292			}
6293
6294			acc = append(acc, "-subset", "INSDFeature_intervals")
6295			if isPipe {
6296				acc = append(acc, "-deq", "\\t")
6297			} else {
6298				acc = append(acc, "-deq", "\"\\t\"")
6299			}
6300
6301		} else if str == "chloroplast" ||
6302			str == "chromoplast" ||
6303			str == "cyanelle" ||
6304			str == "environmental_sample" ||
6305			str == "focus" ||
6306			str == "germline" ||
6307			str == "kinetoplast" ||
6308			str == "macronuclear" ||
6309			str == "metagenomic" ||
6310			str == "mitochondrion" ||
6311			str == "partial" ||
6312			str == "proviral" ||
6313			str == "pseudo" ||
6314			str == "rearranged" ||
6315			str == "ribosomal_slippage" ||
6316			str == "trans_splicing" ||
6317			str == "transgenic" ||
6318			str == "virion" {
6319
6320			acc = append(acc, "-block", "INSDQualifier")
6321
6322			checkAgainstVocabulary(str, "qualifier", qualifiers)
6323			if doIndex {
6324				acc = append(acc, "-if", "INSDQualifier_name", "-equals", str)
6325				acc = append(acc, "-clr", "-indices", "INSDQualifier_name")
6326			} else {
6327				acc = append(acc, "-if", "INSDQualifier_name", "-equals", str)
6328				acc = append(acc, "-lbl", str)
6329			}
6330			if addDash {
6331				acc = append(acc, "-block", "INSDFeature", "-unless", "INSDQualifier_name", "-equals", str)
6332				if isPipe {
6333					acc = append(acc, "-lbl", "\\-")
6334				} else {
6335					acc = append(acc, "-lbl", "\"\\-\"")
6336				}
6337			}
6338
6339		} else {
6340
6341			acc = append(acc, "-block", "INSDQualifier")
6342
6343			checkAgainstVocabulary(str, "qualifier", qualifiers)
6344			if len(str) > 2 && str[0] == '%' {
6345				acc = append(acc, "-if", "INSDQualifier_name", "-equals", str[1:])
6346				if doIndex {
6347					if isPipe {
6348						acc = append(acc, "-clr", "-indices", "%INSDQualifier_value")
6349					} else {
6350						acc = append(acc, "-clr", "-indices", "\"%INSDQualifier_value\"")
6351					}
6352				} else {
6353					if isPipe {
6354						acc = append(acc, "-element", "%INSDQualifier_value")
6355					} else {
6356						acc = append(acc, "-element", "\"%INSDQualifier_value\"")
6357					}
6358				}
6359				if addDash {
6360					acc = append(acc, "-block", "INSDFeature", "-unless", "INSDQualifier_name", "-equals", str[1:])
6361					if isPipe {
6362						acc = append(acc, "-lbl", "\\-")
6363					} else {
6364						acc = append(acc, "-lbl", "\"\\-\"")
6365					}
6366				}
6367			} else {
6368				if doIndex {
6369					acc = append(acc, "-if", "INSDQualifier_name", "-equals", str)
6370					acc = append(acc, "-clr", "-indices", "INSDQualifier_value")
6371				} else {
6372					acc = append(acc, "-if", "INSDQualifier_name", "-equals", str)
6373					acc = append(acc, "-element", "INSDQualifier_value")
6374				}
6375				if addDash {
6376					acc = append(acc, "-block", "INSDFeature", "-unless", "INSDQualifier_name", "-equals", str)
6377					if isPipe {
6378						acc = append(acc, "-lbl", "\\-")
6379					} else {
6380						acc = append(acc, "-lbl", "\"\\-\"")
6381					}
6382				}
6383			}
6384		}
6385	}
6386
6387	if doIndex {
6388		if isPipe {
6389			acc = append(acc, "-group", "INSDSeq", "-clr", "-lbl", "    </IdxSearchFields>\n")
6390		} else {
6391			acc = append(acc, "-group", "INSDSeq", "-clr", "-lbl", "\"    </IdxSearchFields>\\n\"")
6392		}
6393	}
6394
6395	return acc
6396}
6397
6398// BIOTHINGS EXTRACTION COMMAND GENERATOR
6399
// processBiopath generates extraction commands for BioThings resources (undocumented)
//
// args[0] names the parent object used as the -pattern, and args[1] is a
// comma-separated list of dotted exploration paths. Each path contributes a
// -path argument, a -tab newline separator, and an -element argument naming the
// last node of the path. Arguments after args[1] are ignored.
//
// When isPipe is false every generated value is wrapped in double quotes, since
// main prints the result as a command line instead of executing it.
//
// The function writes an error to stderr and exits with status 1 if fewer than
// two arguments are supplied, if a path is empty, or if a path has more than
// seven nodes.
func processBiopath(args []string, isPipe bool) []string {

	// nquire -get "http://myvariant.info/v1/variant/chr6:g.26093141G>A" \
	//   -fields clinvar.rcv.conditions.identifiers \
	//   -always_list clinvar.rcv.conditions.identifiers |
	// transmute -j2x |
	// xtract -biopath opt clinvar.rcv.conditions.identifiers.omim

	if len(args) < 2 {
		fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to xtract -biopath\n")
		os.Exit(1)
	}

	obj := args[0]
	paths := args[1]

	acc := []string{"-pattern", obj}

	for _, item := range strings.Split(paths, ",") {

		// strings.Split never returns an empty slice, so a missing path
		// (e.g. from a trailing comma) has to be detected on the item itself
		if item == "" {
			fmt.Fprintf(os.Stderr, "\nERROR: Insufficient path arguments supplied to xtract -biopath\n")
			os.Exit(1)
		}

		nodes := strings.Split(item, ".")
		if len(nodes) > 7 {
			fmt.Fprintf(os.Stderr, "\nERROR: Too many nodes in argument supplied to xtract -biopath\n")
			os.Exit(1)
		}

		// the last node of the dotted path is the element to print
		leaf := nodes[len(nodes)-1]

		if isPipe {
			acc = append(acc, "-path", item, "-tab", "\\n", "-element", leaf)
		} else {
			acc = append(acc, "-path", "\""+item+"\"", "-tab", "\"\\n\"", "-element", "\""+leaf+"\"")
		}
	}

	return acc
}
6455
6456// HYDRA CITATION MATCHER COMMAND GENERATOR
6457
// processHydra generates extraction commands for NCBI's in-house citation matcher (undocumented)
//
// The returned arguments select Id objects whose score attribute is exactly "1"
// or starts with "0.9" or "0.8", and print the Id. The isPipe argument is not
// consulted, so the same list is returned in either mode.
func processHydra(isPipe bool) []string {

	// acceptable scores are 0.8 or higher, exact match on "1" rejects low value
	// in scientific notation with minus sign present
	return []string{
		"-pattern", "Id",
		"-if", "@score", "-equals", "1",
		"-or", "@score", "-starts-with", "0.9",
		"-or", "@score", "-starts-with", "0.8",
		"-element", "Id",
	}
}
6473
6474// ENTREZ2INDEX COMMAND GENERATOR
6475
6476// processE2Index generates extraction commands to create input for Entrez2Index
6477func processE2Index(args []string, tform string, isPipe bool) []string {
6478
6479	var acc []string
6480
6481	max := len(args)
6482	if max < 3 {
6483		fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to -e2index\n")
6484		os.Exit(1)
6485	}
6486
6487	year := ""
6488	patrn := args[0]
6489	args = args[1:]
6490
6491	isAllNumeric := func(str string) bool {
6492
6493		for _, ch := range str {
6494			if !unicode.IsDigit(ch) &&
6495				ch != '.' &&
6496				ch != '+' &&
6497				ch != '-' &&
6498				ch != '*' &&
6499				ch != '/' &&
6500				ch != ',' &&
6501				ch != '$' &&
6502				ch != '#' &&
6503				ch != '%' &&
6504				ch != '(' &&
6505				ch != ')' {
6506				return false
6507			}
6508		}
6509
6510		return true
6511	}
6512
6513	if isAllNumeric(patrn) {
6514		year = patrn
6515		patrn = args[0]
6516		args = args[1:]
6517	}
6518
6519	ident := args[0]
6520	args = args[1:]
6521
6522	if !isPipe {
6523		if !deStop {
6524			acc = append(acc, "-stops")
6525		}
6526		if doStem {
6527			acc = append(acc, "-stems")
6528		}
6529	}
6530
6531	if isPipe {
6532		acc = append(acc, "-head", "<IdxDocumentSet>", "-tail", "</IdxDocumentSet>")
6533		acc = append(acc, "-hd", "  <IdxDocument>\\n", "-tl", "  </IdxDocument>")
6534		acc = append(acc, "-pattern")
6535		acc = append(acc, patrn)
6536		if year != "" {
6537			acc = append(acc, "-if", "PubDate/Year", "-ge", year)
6538			acc = append(acc, "-or", "PubDate/MedlineDate[1:4]", "-ge", year)
6539		}
6540		acc = append(acc, "-pfx", "    <IdxUid>", "-sfx", "</IdxUid>\\n")
6541		acc = append(acc, "-element")
6542		acc = append(acc, ident)
6543		acc = append(acc, "-clr", "-rst", "-tab", "")
6544		acc = append(acc, "-lbl", "    <IdxSearchFields>\\n")
6545		acc = append(acc, "-pfx", "      <YEAR>", "-sfx", "</YEAR>\\n")
6546		acc = append(acc, "-year", "PubDate/*")
6547		acc = append(acc, "-clr", "-rst", "-tab", "")
6548		acc = append(acc, "-indices")
6549		for _, str := range args {
6550			acc = append(acc, str)
6551		}
6552		if tform != "" {
6553			acc = append(acc, "-clr", "-rst", "-tab", "\"\"")
6554			acc = append(acc, "-sep", ",", "-meshcode")
6555			acc = append(acc, "MeshHeading/DescriptorName@UI,Chemical/NameOfSubstance@UI,SupplMeshName@UI")
6556		}
6557		acc = append(acc, "-clr", "-lbl", "    </IdxSearchFields>\\n")
6558	} else {
6559		acc = append(acc, "-head", "\"<IdxDocumentSet>\"", "-tail", "\"</IdxDocumentSet>\"")
6560		acc = append(acc, "-hd", "\"  <IdxDocument>\\n\"", "-tl", "\"  </IdxDocument>\"")
6561		acc = append(acc, "-pattern")
6562		ql := fmt.Sprintf("\"%s\"", patrn)
6563		acc = append(acc, ql)
6564		if year != "" {
6565			acc = append(acc, "-if", "PubDate/Year", "-ge", year)
6566			acc = append(acc, "-or", "PubDate/MedlineDate[1:4]", "-ge", year)
6567		}
6568		acc = append(acc, "-pfx", "\"    <IdxUid>\"", "-sfx", "\"</IdxUid>\\n\"")
6569		acc = append(acc, "-element")
6570		ql = fmt.Sprintf("\"%s\"", ident)
6571		acc = append(acc, ql)
6572		acc = append(acc, "-clr", "-rst", "-tab", "\"\"")
6573		acc = append(acc, "-lbl", "\"    <IdxSearchFields>\\n\"")
6574		acc = append(acc, "-pfx", "\"      <YEAR>\"", "-sfx", "\"</YEAR>\\n\"")
6575		acc = append(acc, "-year", "\"PubDate/*\"")
6576		acc = append(acc, "-clr", "-rst", "-tab", "\"\"")
6577		acc = append(acc, "-indices")
6578		for _, str := range args {
6579			ql = fmt.Sprintf("\"%s\"", str)
6580			acc = append(acc, ql)
6581		}
6582		if tform != "" {
6583			acc = append(acc, "-clr", "-rst", "-tab", "\"\"")
6584			acc = append(acc, "-sep", "\",\"", "-meshcode")
6585			acc = append(acc, "\"MeshHeading/DescriptorName@UI,Chemical/NameOfSubstance@UI,SupplMeshName@UI\"")
6586		}
6587		acc = append(acc, "-clr", "-lbl", "\"    </IdxSearchFields>\\n\"")
6588	}
6589
6590	return acc
6591}
6592
6593// CONCURRENT CONSUMER GOROUTINES PARSE AND PROCESS PARTITIONED XML OBJECTS
6594
6595// StreamBlocks -> SplitPattern => XmlParse => StreamTokens => ProcessQuery -> MergeResults
6596
6597// processes with single goroutine call defer close(out) so consumer(s) can range over channel
6598// processes with multiple instances call defer wg.Done(), separate goroutine uses wg.Wait() to delay close(out)
6599
6600func createConsumers(cmds *Block, parent, hd, tl string, transform map[string]string, histogram map[string]int, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord {
6601
6602	if inp == nil {
6603		return nil
6604	}
6605
6606	out := make(chan eutils.XMLRecord, eutils.ChanDepth())
6607	if out == nil {
6608		fmt.Fprintf(os.Stderr, "\nERROR: Unable to create consumer channel\n")
6609		os.Exit(1)
6610	}
6611
6612	// xmlConsumer reads partitioned XML from channel and calls parser for processing
6613	xmlConsumer := func(cmds *Block, parent string, wg *sync.WaitGroup, inp <-chan eutils.XMLRecord, out chan<- eutils.XMLRecord) {
6614
6615		// report when this consumer has no more records to process
6616		defer wg.Done()
6617
6618		// read partitioned XML from producer channel
6619		for ext := range inp {
6620
6621			idx := ext.Index
6622			text := ext.Text
6623
6624			if text == "" {
6625				// should never see empty input data
6626				out <- eutils.XMLRecord{Index: idx, Text: text}
6627				continue
6628			}
6629
6630			str := processQuery(text[:], parent, idx, hd, tl, transform, histogram, cmds)
6631
6632			// send even if empty to get all record counts for reordering
6633			out <- eutils.XMLRecord{Index: idx, Text: str}
6634		}
6635	}
6636
6637	var wg sync.WaitGroup
6638
6639	// launch multiple consumer goroutines
6640	for i := 0; i < eutils.NumServe(); i++ {
6641		wg.Add(1)
6642		go xmlConsumer(cmds, parent, &wg, inp, out)
6643	}
6644
6645	// launch separate anonymous goroutine to wait until all consumers are done
6646	go func() {
6647		wg.Wait()
6648		close(out)
6649	}()
6650
6651	return out
6652}
6653
6654func createSelectors(parent, indx string, order map[string]bool, inp <-chan eutils.XMLRecord) <-chan eutils.XMLRecord {
6655
6656	if parent == "" || indx == "" || order == nil || inp == nil {
6657		return nil
6658	}
6659
6660	find := eutils.ParseIndex(indx)
6661
6662	out := make(chan eutils.XMLRecord, eutils.ChanDepth())
6663	if out == nil {
6664		fmt.Fprintf(os.Stderr, "\nERROR: Unable to create selector channel\n")
6665		os.Exit(1)
6666	}
6667
6668	// xmlSelector reads partitioned XML from channel and matches identifiers of records to keep
6669	xmlSelector := func(wg *sync.WaitGroup, inp <-chan eutils.XMLRecord, out chan<- eutils.XMLRecord) {
6670
6671		// report when this selector has no more records to process
6672		defer wg.Done()
6673
6674		// read partitioned XML from producer channel
6675		for ext := range inp {
6676
6677			text := ext.Text
6678
6679			found := false
6680
6681			eutils.FindIdentifiers(text[:], parent, find,
6682				func(id string) {
6683					id = sortStringByWords(id)
6684					_, ok := order[id]
6685					if ok {
6686						found = true
6687					}
6688				})
6689
6690			if !found {
6691				// identifier field not found or not in identifier list, send empty placeholder for unshuffler
6692				out <- eutils.XMLRecord{Index: ext.Index}
6693				continue
6694			}
6695
6696			// send selected record
6697			out <- eutils.XMLRecord{Index: ext.Index, Text: text}
6698		}
6699	}
6700
6701	var wg sync.WaitGroup
6702
6703	// launch multiple selector goroutines
6704	for i := 0; i < eutils.NumServe(); i++ {
6705		wg.Add(1)
6706		go xmlSelector(&wg, inp, out)
6707	}
6708
6709	// launch separate anonymous goroutine to wait until all selectors are done
6710	go func() {
6711		wg.Wait()
6712		close(out)
6713	}()
6714
6715	return out
6716}
6717
6718// MAIN FUNCTION
6719
6720// e.g., xtract -pattern PubmedArticle -element MedlineCitation/PMID -block Author -sep " " -element Initials,LastName
6721
6722func main() {
6723
6724	// skip past executable name
6725	args := os.Args[1:]
6726
6727	if len(args) < 1 {
6728		fmt.Fprintf(os.Stderr, "\nERROR: No command-line arguments supplied to xtract\n")
6729		os.Exit(1)
6730	}
6731
6732	// performance arguments
6733	chanDepth := 0
6734	farmSize := 0
6735	heapSize := 0
6736	numServe := 0
6737	goGc := 0
6738
6739	// processing option arguments
6740	doCompress := false
6741	doCleanup := false
6742	doStrict := false
6743	doMixed := false
6744	deAccent := false
6745	doASCII := false
6746	doStem = false
6747	deStop = true
6748
6749	/*
6750		doUnicode := false
6751		doScript := false
6752		doMathML := false
6753	*/
6754
6755	// CONCURRENCY, CLEANUP, AND DEBUGGING FLAGS
6756
6757	// do these first because -defcpu and -maxcpu can be sent from wrapper before other arguments
6758
6759	ncpu := runtime.NumCPU()
6760	if ncpu < 1 {
6761		ncpu = 1
6762	}
6763
6764	// wrapper can limit maximum number of processors to use (undocumented)
6765	maxProcs := 0
6766	defProcs := 0
6767
6768	// concurrent performance tuning parameters, can be overridden by -proc and -cons
6769	numProcs := 0
6770	serverRatio := 4
6771
6772	// -flag sets -strict or -mixed cleanup flags from argument
6773	flgs := ""
6774
6775	/*
6776		unicodePolicy := ""
6777		scriptPolicy := ""
6778		mathmlPolicy := ""
6779	*/
6780
6781	// read data from file instead of stdin
6782	fileName := ""
6783
6784	// debugging
6785	mpty := false
6786	idnt := false
6787	stts := false
6788	timr := false
6789
6790	// profiling
6791	prfl := false
6792
6793	// repeat the specified extraction 5 times for each -proc from 1 to nCPU
6794	trial := false
6795
6796	inSwitch := true
6797
6798	// get concurrency, cleanup, and debugging flags in any order
6799	for {
6800
6801		inSwitch = true
6802
6803		switch args[0] {
6804		// concurrency override arguments can be passed in by local wrapper script (undocumented)
6805		case "-maxcpu":
6806			maxProcs = eutils.GetNumericArg(args, "Maximum number of processors", 1, 1, ncpu)
6807			args = args[1:]
6808		case "-defcpu":
6809			defProcs = eutils.GetNumericArg(args, "Default number of processors", ncpu, 1, ncpu)
6810			args = args[1:]
6811		// performance tuning flags
6812		case "-proc":
6813			numProcs = eutils.GetNumericArg(args, "Number of processors", ncpu, 1, ncpu)
6814			args = args[1:]
6815		case "-cons":
6816			serverRatio = eutils.GetNumericArg(args, "Parser to processor ratio", 4, 1, 32)
6817			args = args[1:]
6818		case "-serv":
6819			numServe = eutils.GetNumericArg(args, "Concurrent parser count", 0, 1, 128)
6820			args = args[1:]
6821		case "-chan":
6822			chanDepth = eutils.GetNumericArg(args, "Communication channel depth", 0, ncpu, 128)
6823			args = args[1:]
6824		case "-heap":
6825			heapSize = eutils.GetNumericArg(args, "Unshuffler heap size", 8, 8, 64)
6826			args = args[1:]
6827		case "-farm":
6828			farmSize = eutils.GetNumericArg(args, "Node buffer length", 4, 4, 2048)
6829			args = args[1:]
6830		case "-gogc":
6831			goGc = eutils.GetNumericArg(args, "Garbage collection percentage", 0, 50, 1000)
6832			args = args[1:]
6833
6834		// read data from file
6835		case "-input":
6836			fileName = eutils.GetStringArg(args, "Input file name")
6837			args = args[1:]
6838
6839		// data cleanup flags
6840		case "-compress", "-compressed":
6841			doCompress = true
6842		case "-spaces", "-cleanup":
6843			doCleanup = true
6844		case "-strict":
6845			doStrict = true
6846		case "-mixed":
6847			doMixed = true
6848		case "-accent":
6849			deAccent = true
6850		case "-ascii":
6851			doASCII = true
6852
6853		// previously visible processing flags (undocumented)
6854		case "-stems", "-stem":
6855			doStem = true
6856		case "-stops", "-stop":
6857			deStop = false
6858
6859		// allow setting of unicode, script, and mathml flags (undocumented)
6860		case "-unicode":
6861			// unicodePolicy = GetStringArg(args, "Unicode argument")
6862			args = args[1:]
6863		case "-script":
6864			// scriptPolicy = GetStringArg(args, "Script argument")
6865			args = args[1:]
6866		case "-mathml":
6867			// mathmlPolicy = GetStringArg(args, "MathML argument")
6868			args = args[1:]
6869
6870		case "-flag", "-flags":
6871			flgs = eutils.GetStringArg(args, "Flags argument")
6872			args = args[1:]
6873
6874		// debugging flags
6875		case "-debug":
6876			// dbug = true
6877		case "-empty":
6878			mpty = true
6879		case "-ident":
6880			idnt = true
6881		case "-stats", "-stat":
6882			stts = true
6883		case "-timer":
6884			timr = true
6885		case "-profile":
6886			prfl = true
6887		case "-trial", "-trials":
6888			trial = true
6889
6890		default:
6891			// if not any of the controls, set flag to break out of for loop
6892			inSwitch = false
6893		}
6894
6895		if !inSwitch {
6896			break
6897		}
6898
6899		// skip past argument
6900		args = args[1:]
6901
6902		if len(args) < 1 {
6903			break
6904		}
6905	}
6906
6907	// -flag allows script to set -strict or -mixed (or -stems, or -stops) from argument
6908	switch flgs {
6909	case "strict":
6910		doStrict = true
6911	case "mixed":
6912		doMixed = true
6913	case "stems", "stem":
6914		doStem = true
6915	case "stops", "stop":
6916		deStop = false
6917	case "none", "default":
6918	default:
6919		if flgs != "" {
6920			fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized -flag value '%s'\n", flgs)
6921			os.Exit(1)
6922		}
6923	}
6924
6925	/*
6926		UnicodeFix = parseMarkup(unicodePolicy, "-unicode")
6927		ScriptFix = parseMarkup(scriptPolicy, "-script")
6928		MathMLFix = parseMarkup(mathmlPolicy, "-mathml")
6929
6930		if UnicodeFix != NOMARKUP {
6931			doUnicode = true
6932		}
6933
6934		if ScriptFix != NOMARKUP {
6935			doScript = true
6936		}
6937
6938		if MathMLFix != NOMARKUP {
6939			doMathML = true
6940		}
6941	*/
6942
6943	if numProcs == 0 {
6944		if defProcs > 0 {
6945			numProcs = defProcs
6946		} else if maxProcs > 0 {
6947			numProcs = maxProcs
6948		}
6949	}
6950	if numProcs > ncpu {
6951		numProcs = ncpu
6952	}
6953	if numProcs > maxProcs {
6954		numProcs = maxProcs
6955	}
6956
6957	eutils.SetTunings(numProcs, numServe, serverRatio, chanDepth, farmSize, heapSize, goGc)
6958
6959	eutils.SetOptions(doStrict, doMixed, deAccent, doASCII, doCompress, doCleanup)
6960
6961	// -stats prints number of CPUs and performance tuning values if no other arguments (undocumented)
6962	if stts && len(args) < 1 {
6963
6964		eutils.PrintStats()
6965
6966		return
6967	}
6968
6969	if len(args) < 1 {
6970		fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to xtract\n")
6971		os.Exit(1)
6972	}
6973
6974	// DOCUMENTATION COMMANDS
6975
6976	inSwitch = true
6977
6978	switch args[0] {
6979	case "-version":
6980		fmt.Printf("%s\n", eutils.EDirectVersion)
6981	case "-help":
6982		fmt.Printf("xtract %s\n%s\n", eutils.EDirectVersion, xtractHelp)
6983	case "-examples", "-example":
6984		ex, eerr := os.Executable()
6985		if eerr == nil {
6986			fmt.Printf("xtract %s\n\n", eutils.EDirectVersion)
6987			exPath := filepath.Dir(ex)
6988			fpath := path.Join(exPath, "hlp-xtract.txt")
6989			file, ferr := os.Open(fpath)
6990			if file != nil && ferr == nil {
6991				scanner := bufio.NewScanner(file)
6992				for scanner.Scan() {
6993					fmt.Println(scanner.Text())
6994				}
6995			}
6996			file.Close()
6997			fmt.Printf("\n")
6998		}
6999	case "-extras", "-extra", "-advanced":
7000		fmt.Printf("Please run rchive -help for local record indexing information\n")
7001	case "-internal", "-internals":
7002		fmt.Printf("xtract %s\n%s\n", eutils.EDirectVersion, xtractInternal)
7003	case "-keys":
7004		fmt.Printf("%s\n", keyboardShortcuts)
7005	case "-unix":
7006		fmt.Printf("%s\n", unixCommands)
7007	default:
7008		// if not any of the documentation commands, keep going
7009		inSwitch = false
7010	}
7011
7012	if inSwitch {
7013		return
7014	}
7015
7016	// FILE NAME CAN BE SUPPLIED WITH -input COMMAND
7017
7018	in := os.Stdin
7019
7020	// check for data being piped into stdin
7021	isPipe := false
7022	fi, err := os.Stdin.Stat()
7023	if err == nil {
7024		isPipe = bool((fi.Mode() & os.ModeNamedPipe) != 0)
7025	}
7026
7027	usingFile := false
7028
7029	if fileName != "" {
7030
7031		inFile, err := os.Open(fileName)
7032		if err != nil {
7033			fmt.Fprintf(os.Stderr, "\nERROR: Unable to open input file '%s'\n", fileName)
7034			os.Exit(1)
7035		}
7036
7037		defer inFile.Close()
7038
7039		// use indicated file instead of stdin
7040		in = inFile
7041		usingFile = true
7042
7043		if isPipe && runtime.GOOS != "windows" {
7044			mode := fi.Mode().String()
7045			fmt.Fprintf(os.Stderr, "\nERROR: Input data from both stdin and file '%s', mode is '%s'\n", fileName, mode)
7046			os.Exit(1)
7047		}
7048	}
7049
7050	// check for -input command after extraction arguments
7051	for _, str := range args {
7052		if str == "-input" {
7053			fmt.Fprintf(os.Stderr, "\nERROR: Misplaced -input command\n")
7054			os.Exit(1)
7055		}
7056	}
7057
7058	// START PROFILING IF REQUESTED
7059
7060	if prfl {
7061
7062		f, err := os.Create("cpu.pprof")
7063		if err != nil {
7064			fmt.Fprintf(os.Stderr, "\nERROR: Unable to create profile output file\n")
7065			os.Exit(1)
7066		}
7067
7068		pprof.StartCPUProfile(f)
7069
7070		defer pprof.StopCPUProfile()
7071	}
7072
7073	// INITIALIZE RECORD COUNT
7074
7075	recordCount := 0
7076	byteCount := 0
7077
7078	// print processing rate and program duration
7079	printDuration := func(name string) {
7080
7081		eutils.PrintDuration(name, recordCount, byteCount)
7082	}
7083
7084	// NAME OF OUTPUT STRING TRANSFORMATION FILE
7085
7086	tform := ""
7087	transform := make(map[string]string)
7088
7089	populateTx := func(tf string) {
7090
7091		inFile, err := os.Open(tf)
7092		if err != nil {
7093			fmt.Fprintf(os.Stderr, "Unable to open transformation file %s\n", err.Error())
7094			os.Exit(1)
7095		}
7096		defer inFile.Close()
7097
7098		scanr := bufio.NewScanner(inFile)
7099
7100		// populate transformation map for -translate (and -matrix) output
7101		for scanr.Scan() {
7102
7103			line := scanr.Text()
7104			frst, scnd := eutils.SplitInTwoLeft(line, "\t")
7105
7106			transform[frst] = scnd
7107		}
7108	}
7109
7110	if len(args) > 2 && args[0] == "-transform" {
7111		tform = args[1]
7112		args = args[2:]
7113		if tform != "" {
7114			populateTx(tform)
7115		}
7116	}
7117
7118	// CREATE XML BLOCK READER FROM STDIN OR FILE
7119
7120	rdr := eutils.CreateXMLStreamer(in)
7121	if rdr == nil {
7122		fmt.Fprintf(os.Stderr, "\nERROR: Unable to create XML Block Reader\n")
7123		os.Exit(1)
7124	}
7125
7126	// SEQUENCE RECORD EXTRACTION COMMAND GENERATOR
7127
7128	// -insd simplifies extraction of INSDSeq qualifiers
7129	if args[0] == "-insd" || args[0] == "-insd-" || args[0] == "-insd-idx" {
7130
7131		addDash := true
7132		doIndex := false
7133		// -insd- variant suppresses use of dash as placeholder for missing qualifiers (undocumented)
7134		if args[0] == "-insd-" {
7135			addDash = false
7136		}
7137		// -insd-idx variant creates word index using -indices command (undocumented)
7138		if args[0] == "-insd-idx" {
7139			doIndex = true
7140			addDash = false
7141		}
7142
7143		args = args[1:]
7144
7145		insd := processINSD(args, isPipe || usingFile, addDash, doIndex)
7146
7147		if !isPipe && !usingFile {
7148			// no piped input, so write output instructions
7149			fmt.Printf("xtract")
7150			for _, str := range insd {
7151				fmt.Printf(" %s", str)
7152			}
7153			fmt.Printf("\n")
7154			return
7155		}
7156
7157		// data in pipe, so replace arguments, execute dynamically
7158		args = insd
7159	}
7160
7161	// CITATION MATCHER EXTRACTION COMMAND GENERATOR
7162
7163	// -hydra filters HydraResponse output by relevance score (undocumented)
7164	if args[0] == "-hydra" {
7165
7166		hydra := processHydra(isPipe || usingFile)
7167
7168		if !isPipe && !usingFile {
7169			// no piped input, so write output instructions
7170			fmt.Printf("xtract")
7171			for _, str := range hydra {
7172				fmt.Printf(" %s", str)
7173			}
7174			fmt.Printf("\n")
7175			return
7176		}
7177
7178		// data in pipe, so replace arguments, execute dynamically
7179		args = hydra
7180	}
7181
7182	// BIOTHINGS EXTRACTION COMMAND GENERATOR
7183
7184	// -biopath takes a parent object and a dotted exploration path for BioThings resources (undocumented)
7185	if args[0] == "-biopath" {
7186
7187		args = args[1:]
7188
7189		biopath := processBiopath(args, isPipe || usingFile)
7190
7191		if !isPipe && !usingFile {
7192			// no piped input, so write output instructions
7193			fmt.Printf("xtract")
7194			for _, str := range biopath {
7195				fmt.Printf(" %s", str)
7196			}
7197			fmt.Printf("\n")
7198			return
7199		}
7200
7201		// data in pipe, so replace arguments, execute dynamically
7202		args = biopath
7203	}
7204
7205	// ENTREZ2INDEX COMMAND GENERATOR
7206
7207	// -e2index shortcut for experimental indexing code (documented in rchive.go)
7208	if args[0] == "-e2index" {
7209
7210		// e.g., xtract -transform "$EDIRECT_MESH_TREE" -e2index
7211
7212		args = args[1:]
7213
7214		if len(args) == 0 {
7215			// if no arguments, use default values
7216			args = []string{"PubmedArticle", "MedlineCitation/PMID", "ArticleTitle,Abstract/AbstractText"}
7217		}
7218
7219		// environment variable can override garbage collector (undocumented)
7220		gcEnv := os.Getenv("EDIRECT_INDEX_GOGC")
7221		if gcEnv != "" {
7222			val, err := strconv.Atoi(gcEnv)
7223			if err == nil {
7224				if val >= 50 && val <= 1000 {
7225					debug.SetGCPercent(val)
7226				} else {
7227					debug.SetGCPercent(100)
7228				}
7229			}
7230		}
7231
7232		// environment variable can override number of servers (undocumented)
7233		svEnv := os.Getenv("EDIRECT_INDEX_SERV")
7234		if svEnv != "" {
7235			val, err := strconv.Atoi(svEnv)
7236			if err == nil {
7237				if val >= 1 && val <= 128 {
7238					numServe = val
7239				} else {
7240					numServe = 1
7241				}
7242				eutils.SetTunings(numProcs, numServe, serverRatio, chanDepth, farmSize, heapSize, goGc)
7243			}
7244		}
7245
7246		res := processE2Index(args, tform, isPipe || usingFile)
7247
7248		if !isPipe && !usingFile {
7249			// no piped input, so write output instructions
7250			fmt.Printf("xtract")
7251			if tform != "" {
7252				fmt.Printf(" -transform %s", tform)
7253			}
7254			for _, str := range res {
7255				fmt.Printf(" %s", str)
7256			}
7257			fmt.Printf("\n")
7258			return
7259		}
7260
7261		// data in pipe, so replace arguments, execute dynamically
7262		args = res
7263	}
7264
7265	// CONFIRM INPUT DATA AVAILABILITY AFTER RUNNING COMMAND GENERATORS
7266
7267	if fileName == "" && runtime.GOOS != "windows" {
7268
7269		fromStdin := bool((fi.Mode() & os.ModeCharDevice) == 0)
7270		if !isPipe || !fromStdin {
7271			mode := fi.Mode().String()
7272			fmt.Fprintf(os.Stderr, "\nERROR: No data supplied to xtract from stdin or file, mode is '%s'\n", mode)
7273			os.Exit(1)
7274		}
7275	}
7276
7277	if !usingFile && !isPipe {
7278
7279		fmt.Fprintf(os.Stderr, "\nERROR: No XML input data supplied to xtract\n")
7280		os.Exit(1)
7281	}
7282
7283	// XML VALIDATION
7284
7285	nextArg := func() (string, bool) {
7286
7287		if len(args) < 1 {
7288			return "", false
7289		}
7290
7291		// remove next token from slice
7292		nxt := args[0]
7293		args = args[1:]
7294
7295		return nxt, true
7296	}
7297
7298	if args[0] == "-verify" || args[0] == "-validate" {
7299
7300		// skip past command name
7301		args = args[1:]
7302
7303		find := ""
7304		html := false
7305
7306		// look for optional arguments
7307		for {
7308			arg, ok := nextArg()
7309			if !ok {
7310				break
7311			}
7312
7313			switch arg {
7314			case "-find":
7315				// override set wrapper
7316				find, ok = nextArg()
7317			case "-html":
7318				html = true
7319			}
7320		}
7321
7322		recordCount = eutils.ValidateXML(rdr, find, html)
7323
7324		debug.FreeOSMemory()
7325
7326		// suppress printing of lines if not properly counted
7327		if recordCount == 1 {
7328			recordCount = 0
7329		}
7330
7331		if timr {
7332			printDuration("lines")
7333		}
7334
7335		return
7336	}
7337
7338	// MISCELLANEOUS TIMING COMMANDS
7339
7340	if args[0] == "-chunk" {
7341
7342		for str := range rdr {
7343			recordCount++
7344			byteCount += len(str)
7345		}
7346
7347		printDuration("blocks")
7348
7349		return
7350	}
7351
7352	if args[0] == "-split" {
7353
7354		if len(args) > 1 {
7355			if args[1] == "-pattern" {
7356				// skip past -split if followed by -pattern
7357				args = args[1:]
7358			}
7359		}
7360		if len(args) < 2 {
7361			fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -split command\n")
7362			os.Exit(1)
7363		}
7364		pat := args[1]
7365
7366		eutils.PartitionPattern(pat, "", rdr,
7367			func(str string) {
7368				recordCount++
7369				byteCount += len(str)
7370			})
7371
7372		printDuration("patterns")
7373
7374		return
7375	}
7376
7377	if args[0] == "-token" {
7378
7379		eutils.StreamTokens(rdr,
7380			func(tkn eutils.XMLToken) {
7381				recordCount++
7382				byteCount += len(tkn.Name) + len(tkn.Attr)
7383			})
7384
7385		printDuration("tokens")
7386
7387		return
7388	}
7389
7390	// SPECIFY STRINGS TO GO BEFORE AND AFTER ENTIRE OUTPUT OR EACH RECORD
7391
7392	head := ""
7393	tail := ""
7394
7395	hd := ""
7396	tl := ""
7397
7398	for {
7399
7400		inSwitch = true
7401
7402		switch args[0] {
7403		case "-head":
7404			if len(args) < 2 {
7405				fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -head command\n")
7406				os.Exit(1)
7407			}
7408			head = eutils.ConvertSlash(args[1])
7409			// allow splitting of -head argument, keep appending until next command (undocumented)
7410			ofs, nxt := 0, args[2:]
7411			for {
7412				if len(nxt) < 1 {
7413					break
7414				}
7415				tmp := nxt[0]
7416				if strings.HasPrefix(tmp, "-") {
7417					break
7418				}
7419				ofs++
7420				txt := eutils.ConvertSlash(tmp)
7421				if head != "" && !strings.HasSuffix(head, "\t") {
7422					head += "\t"
7423				}
7424				head += txt
7425				nxt = nxt[1:]
7426			}
7427			if ofs > 0 {
7428				args = args[ofs:]
7429			}
7430		case "-tail":
7431			if len(args) < 2 {
7432				fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -tail command\n")
7433				os.Exit(1)
7434			}
7435			tail = eutils.ConvertSlash(args[1])
7436		case "-hd":
7437			if len(args) < 2 {
7438				fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -hd command\n")
7439				os.Exit(1)
7440			}
7441			hd = eutils.ConvertSlash(args[1])
7442		case "-tl":
7443			if len(args) < 2 {
7444				fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -tl command\n")
7445				os.Exit(1)
7446			}
7447			tl = eutils.ConvertSlash(args[1])
7448		case "-wrp":
7449			// shortcut to wrap records in XML tags
7450			if len(args) < 2 {
7451				fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -wrp command\n")
7452				os.Exit(1)
7453			}
7454			tmp := eutils.ConvertSlash(args[1])
7455			lft, rgt := eutils.SplitInTwoLeft(tmp, ",")
7456			if lft != "" {
7457				head = "<" + lft + ">"
7458				tail = "</" + lft + ">"
7459			}
7460			if rgt != "" {
7461				hd = "<" + rgt + ">"
7462				tl = "</" + rgt + ">"
7463			}
7464		case "-set":
7465			if len(args) < 2 {
7466				fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -set command\n")
7467				os.Exit(1)
7468			}
7469			tmp := eutils.ConvertSlash(args[1])
7470			if tmp != "" {
7471				head = "<" + tmp + ">"
7472				tail = "</" + tmp + ">"
7473			}
7474		case "-rec":
7475			if len(args) < 2 {
7476				fmt.Fprintf(os.Stderr, "\nERROR: Pattern missing after -rec command\n")
7477				os.Exit(1)
7478			}
7479			tmp := eutils.ConvertSlash(args[1])
7480			if tmp != "" {
7481				hd = "<" + tmp + ">"
7482				tl = "</" + tmp + ">"
7483			}
7484		default:
7485			// if not any of the controls, set flag to break out of for loop
7486			inSwitch = false
7487		}
7488
7489		if !inSwitch {
7490			break
7491		}
7492
7493		// skip past arguments
7494		args = args[2:]
7495
7496		if len(args) < 1 {
7497			fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to xtract\n")
7498			os.Exit(1)
7499		}
7500	}
7501
7502	// ENSURE PRESENCE OF PATTERN ARGUMENT
7503
7504	if len(args) < 1 {
7505		fmt.Fprintf(os.Stderr, "\nERROR: Insufficient command-line arguments supplied to xtract\n")
7506		os.Exit(1)
7507	}
7508
7509	// allow -record as synonym of -pattern (undocumented)
7510	if args[0] == "-record" || args[0] == "-Record" {
7511		args[0] = "-pattern"
7512	}
7513
7514	// make sure top-level -pattern command is next
7515	if args[0] != "-pattern" && args[0] != "-Pattern" {
7516		fmt.Fprintf(os.Stderr, "\nERROR: No -pattern in command-line arguments\n")
7517		os.Exit(1)
7518	}
7519	if len(args) < 2 {
7520		fmt.Fprintf(os.Stderr, "\nERROR: Item missing after -pattern command\n")
7521		os.Exit(1)
7522	}
7523
7524	topPat := args[1]
7525	if topPat == "" {
7526		fmt.Fprintf(os.Stderr, "\nERROR: Item missing after -pattern command\n")
7527		os.Exit(1)
7528	}
7529	if strings.HasPrefix(topPat, "-") {
7530		fmt.Fprintf(os.Stderr, "\nERROR: Misplaced %s command\n", topPat)
7531		os.Exit(1)
7532	}
7533
7534	// look for -pattern Parent/* construct for heterogeneous data, e.g., -pattern PubmedArticleSet/*
7535	topPattern, star := eutils.SplitInTwoLeft(topPat, "/")
7536	if topPattern == "" {
7537		return
7538	}
7539
7540	parent := ""
7541	if star == "*" {
7542		parent = topPattern
7543	} else if star != "" {
7544		fmt.Fprintf(os.Stderr, "\nERROR: -pattern Parent/Child construct is not supported\n")
7545		os.Exit(1)
7546	}
7547
7548	// READ FILE OF IDENTIFIERS AND CONCURRENTLY EXTRACT SELECTED RECORDS
7549
7550	// -pattern record_name -select parent/element@attribute^version -in file_of_identifiers
7551	if len(args) == 6 && args[2] == "-select" && (args[4] == "-in" || args[4] == "-retaining") {
7552
7553		indx := args[3]
7554		unqe := args[5]
7555
7556		// read file of identifiers to use for filtering
7557		fl, err := os.Open(unqe)
7558		if err != nil {
7559			fmt.Fprintf(os.Stderr, "\nERROR: Unable to open identifier file '%s'\n", unqe)
7560			os.Exit(1)
7561		}
7562
7563		// create map that records each UID
7564		order := make(map[string]bool)
7565
7566		scanr := bufio.NewScanner(fl)
7567
7568		// read lines of identifiers
7569		for scanr.Scan() {
7570
7571			line := scanr.Text()
7572			id, _ := eutils.SplitInTwoLeft(line, "\t")
7573
7574			id = sortStringByWords(id)
7575
7576			// add identifier to map
7577			order[id] = true
7578		}
7579
7580		fl.Close()
7581
7582		xmlq := eutils.CreateXMLProducer(topPattern, star, rdr)
7583		fchq := createSelectors(topPattern, indx, order, xmlq)
7584		unsq := eutils.CreateXMLUnshuffler(fchq)
7585
7586		if xmlq == nil || fchq == nil || unsq == nil {
7587			fmt.Fprintf(os.Stderr, "\nERROR: Unable to create selector\n")
7588			os.Exit(1)
7589		}
7590
7591		if head != "" {
7592			os.Stdout.WriteString(head)
7593			os.Stdout.WriteString("\n")
7594		}
7595
7596		// drain output channel
7597		for curr := range unsq {
7598
7599			str := curr.Text
7600
7601			if str == "" {
7602				continue
7603			}
7604
7605			if hd != "" {
7606				os.Stdout.WriteString(hd)
7607				os.Stdout.WriteString("\n")
7608			}
7609
7610			// send result to output
7611			os.Stdout.WriteString(str)
7612			if !strings.HasSuffix(str, "\n") {
7613				os.Stdout.WriteString("\n")
7614			}
7615
7616			if tl != "" {
7617				os.Stdout.WriteString(tl)
7618				os.Stdout.WriteString("\n")
7619			}
7620
7621			recordCount++
7622			runtime.Gosched()
7623		}
7624
7625		if tail != "" {
7626			os.Stdout.WriteString(tail)
7627			os.Stdout.WriteString("\n")
7628		}
7629
7630		debug.FreeOSMemory()
7631
7632		if timr {
7633			printDuration("records")
7634		}
7635
7636		return
7637	}
7638
7639	// READ FILE OF IDENTIFIERS AND EXCLUDE SELECTED RECORDS
7640
	// -pattern record_name -select element -excluding file_of_identifiers (undocumented)
7642	if len(args) == 6 && args[2] == "-select" && args[4] == "-excluding" {
7643
7644		indx := args[3]
7645		unqe := args[5]
7646
7647		// read file of identifiers to use for filtering
7648		fl, err := os.Open(unqe)
7649		if err != nil {
7650			fmt.Fprintf(os.Stderr, "\nERROR: Unable to open identifier file '%s'\n", unqe)
7651			os.Exit(1)
7652		}
7653
7654		// create map that records each UID
7655		order := make(map[string]bool)
7656
7657		scanr := bufio.NewScanner(fl)
7658
7659		// read lines of identifiers
7660		for scanr.Scan() {
7661
7662			line := scanr.Text()
7663			id, _ := eutils.SplitInTwoLeft(line, "\t")
7664			id = strings.ToLower(id)
7665
7666			// add identifier to map
7667			order[id] = true
7668		}
7669
7670		fl.Close()
7671
7672		find := eutils.ParseIndex(indx)
7673
7674		if head != "" {
7675			os.Stdout.WriteString(head)
7676			os.Stdout.WriteString("\n")
7677		}
7678
7679		eutils.PartitionPattern(topPattern, star, rdr,
7680			func(str string) {
7681				recordCount++
7682
7683				id := eutils.FindIdentifier(str[:], parent, find)
7684				if id != "" {
7685					id = strings.ToLower(id)
7686					_, ok := order[id]
7687					if ok {
7688						// in exclusion list, skip
7689						return
7690					}
7691				}
7692
7693				if hd != "" {
7694					os.Stdout.WriteString(hd)
7695					os.Stdout.WriteString("\n")
7696				}
7697
7698				// write selected record
7699				os.Stdout.WriteString(str[:])
7700				os.Stdout.WriteString("\n")
7701
7702				if tl != "" {
7703					os.Stdout.WriteString(tl)
7704					os.Stdout.WriteString("\n")
7705				}
7706			})
7707
7708		if tail != "" {
7709			os.Stdout.WriteString(tail)
7710			os.Stdout.WriteString("\n")
7711		}
7712
7713		debug.FreeOSMemory()
7714
7715		if timr {
7716			printDuration("records")
7717		}
7718
7719		return
7720	}
7721
7722	// READ ORDERED FILE OF IDENTIFIERS AND XML STRINGS, APPEND XML JUST INSIDE CLOSING TAG OF APPROPRIATE RECORD
7723
7724	// -pattern record_name -select element -appending file_of_identifiers_and_metadata (undocumented)
7725	if len(args) == 6 && args[2] == "-select" && args[4] == "-appending" {
7726
7727		indx := args[3]
7728		apnd := args[5]
7729
7730		fl, err := os.Open(apnd)
7731		if err != nil {
7732			fmt.Fprintf(os.Stderr, "\nERROR: Unable to open transformation file '%s'\n", apnd)
7733			os.Exit(1)
7734		}
7735
7736		scanr := bufio.NewScanner(fl)
7737
7738		find := eutils.ParseIndex(indx)
7739
7740		if head != "" {
7741			os.Stdout.WriteString(head)
7742			os.Stdout.WriteString("\n")
7743		}
7744
7745		rgt := "</" + topPattern + ">"
7746
7747		eutils.PartitionPattern(topPattern, star, rdr,
7748			func(str string) {
7749				recordCount++
7750
7751				id := eutils.FindIdentifier(str[:], parent, find)
7752				if id == "" {
7753					return
7754				}
7755				id = strings.ToLower(id)
7756
7757				for scanr.Scan() {
7758
7759					line := scanr.Text()
7760					frst, scnd := eutils.SplitInTwoLeft(line, "\t")
7761					frst = strings.ToLower(frst)
7762
7763					if id != frst {
7764						return
7765					}
7766					if !strings.HasSuffix(str, rgt) {
7767						return
7768					}
7769
7770					lft := strings.TrimSuffix(str, rgt)
7771					str = lft + "  " + scnd + "\n" + rgt
7772
7773					if hd != "" {
7774						os.Stdout.WriteString(hd)
7775						os.Stdout.WriteString("\n")
7776					}
7777
7778					os.Stdout.WriteString(str[:])
7779					os.Stdout.WriteString("\n")
7780
7781					if tl != "" {
7782						os.Stdout.WriteString(tl)
7783						os.Stdout.WriteString("\n")
7784					}
7785
7786					break
7787				}
7788			})
7789
7790		if tail != "" {
7791			os.Stdout.WriteString(tail)
7792			os.Stdout.WriteString("\n")
7793		}
7794
7795		fl.Close()
7796
7797		debug.FreeOSMemory()
7798
7799		if timr {
7800			printDuration("records")
7801		}
7802
7803		return
7804	}
7805
7806	// SORT XML RECORDS BY IDENTIFIER
7807
7808	// -pattern record_name -sort parent/element@attribute^version
7809	if len(args) == 4 && args[2] == "-sort" {
7810
7811		indx := args[3]
7812
7813		// create map that records each UID
7814		order := make(map[string][]string)
7815
7816		find := eutils.ParseIndex(indx)
7817
7818		eutils.PartitionPattern(topPattern, star, rdr,
7819			func(str string) {
7820				recordCount++
7821
7822				id := eutils.FindIdentifier(str[:], parent, find)
7823				if id == "" {
7824					return
7825				}
7826
7827				data, ok := order[id]
7828				if !ok {
7829					data = make([]string, 0, 1)
7830				}
7831				data = append(data, str)
7832				// always need to update order, since data may be reallocated
7833				order[id] = data
7834			})
7835
7836		var keys []string
7837		for ky := range order {
7838			keys = append(keys, ky)
7839		}
7840		// sort fields in alphabetical or numeric order
7841		sort.Slice(keys, func(i, j int) bool {
7842			// numeric sort on strings checks lengths first
7843			if eutils.IsAllDigits(keys[i]) && eutils.IsAllDigits(keys[j]) {
7844				lni := len(keys[i])
7845				lnj := len(keys[j])
7846				// shorter string is numerically less, assuming no leading zeros
7847				if lni < lnj {
7848					return true
7849				}
7850				if lni > lnj {
7851					return false
7852				}
7853			}
7854			// same length or non-numeric, can now do string comparison on contents
7855			return keys[i] < keys[j]
7856		})
7857
7858		if head != "" {
7859			os.Stdout.WriteString(head)
7860			os.Stdout.WriteString("\n")
7861		}
7862
7863		for _, id := range keys {
7864
7865			strs := order[id]
7866			for _, str := range strs {
7867				os.Stdout.WriteString(str)
7868				os.Stdout.WriteString("\n")
7869			}
7870		}
7871
7872		if tail != "" {
7873			os.Stdout.WriteString(tail)
7874			os.Stdout.WriteString("\n")
7875		}
7876
7877		debug.FreeOSMemory()
7878
7879		if timr {
7880			printDuration("records")
7881		}
7882
7883		return
7884	}
7885
	// SPLIT FILE BY RECORD COUNT
7887
7888	// split XML record into subfiles by count
7889	if len(args) == 8 && args[2] == "-split" && args[4] == "-prefix" && args[6] == "-suffix" {
7890
7891		// e.g., -head "<IdxDocumentSet>" -tail "</IdxDocumentSet>" -pattern IdxDocument -split 250000 -prefix "biocon" -suffix "e2x"
7892		count := 0
7893		fnum := 0
7894		var (
7895			fl  *os.File
7896			err error
7897		)
7898		chunk, err := strconv.Atoi(args[3])
7899		if err != nil {
7900			fmt.Fprintf(os.Stderr, "%s\n", err.Error())
7901			return
7902		}
7903		prefix := args[5]
7904		suffix := args[7]
7905
7906		eutils.PartitionPattern(topPattern, star, rdr,
7907			func(str string) {
7908				recordCount++
7909
7910				if count >= chunk {
7911					if tail != "" {
7912						fl.WriteString(tail)
7913						fl.WriteString("\n")
7914					}
7915					fl.Close()
7916					count = 0
7917				}
7918				if count == 0 {
7919					fpath := fmt.Sprintf("%s%03d.%s", prefix, fnum, suffix)
7920					fl, err = os.Create(fpath)
7921					if err != nil {
7922						fmt.Fprintf(os.Stderr, "%s\n", err.Error())
7923						return
7924					}
7925					os.Stderr.WriteString(fpath + "\n")
7926					fnum++
7927					if head != "" {
7928						fl.WriteString(head)
7929						fl.WriteString("\n")
7930					}
7931				}
7932				count++
7933
7934				fl.WriteString(str[:])
7935				fl.WriteString("\n")
7936			})
7937
7938		if count >= chunk {
7939			if tail != "" {
7940				fl.WriteString(tail)
7941				fl.WriteString("\n")
7942			}
7943			fl.Close()
7944		}
7945
7946		debug.FreeOSMemory()
7947
7948		if timr {
7949			printDuration("records")
7950		}
7951
7952		return
7953	}
7954
7955	// PARSE AND VALIDATE EXTRACTION ARGUMENTS
7956
7957	// parse nested exploration instruction from command-line arguments
7958	cmds := parseArguments(args, topPattern)
7959	if cmds == nil {
7960		fmt.Fprintf(os.Stderr, "\nERROR: Problem parsing command-line arguments\n")
7961		os.Exit(1)
7962	}
7963
7964	// GLOBAL MAP FOR SORT-UNIQ-COUNT HISTOGRAM ARGUMENT
7965
7966	histogram := make(map[string]int)
7967
7968	// PERFORMANCE TIMING COMMAND
7969
7970	// -stats with an extraction command prints XML size and processing time for each record
7971	if stts {
7972
7973		legend := "REC\tOFST\tSIZE\tTIME"
7974
7975		rec := 0
7976
7977		eutils.PartitionPattern(topPattern, star, rdr,
7978			func(str string) {
7979				rec++
7980				beginTime := time.Now()
7981				processQuery(str[:], parent, rec, hd, tl, transform, histogram, cmds)
7982				endTime := time.Now()
7983				duration := endTime.Sub(beginTime)
7984				micro := int(float64(duration.Nanoseconds()) / 1e3)
7985				if legend != "" {
7986					fmt.Printf("%s\n", legend)
7987					legend = ""
7988				}
7989				fmt.Printf("%d\t%d\t%d\n", rec, len(str), micro)
7990			})
7991
7992		return
7993	}
7994
7995	// PERFORMANCE OPTIMIZATION FUNCTION
7996
7997	// -trial -input fileName runs the specified extraction for each -proc from 1 to nCPU
7998	if trial && fileName != "" {
7999
8000		legend := "CPU\tRATE\tDEV"
8001
8002		for numServ := 1; numServ <= ncpu; numServ++ {
8003
8004			numServe = numServ
8005
8006			eutils.SetTunings(numProcs, numServe, serverRatio, chanDepth, farmSize, heapSize, goGc)
8007
8008			runtime.GOMAXPROCS(numServ)
8009
8010			sum := 0
8011			count := 0
8012			mean := 0.0
8013			m2 := 0.0
8014
8015			// calculate mean and standard deviation of processing rate
8016			for trials := 0; trials < 5; trials++ {
8017
8018				inFile, err := os.Open(fileName)
8019				if err != nil {
8020					fmt.Fprintf(os.Stderr, "\nERROR: Unable to open input file '%s'\n", fileName)
8021					os.Exit(1)
8022				}
8023
8024				trdr := eutils.CreateXMLStreamer(inFile)
8025				if trdr == nil {
8026					fmt.Fprintf(os.Stderr, "\nERROR: Unable to read input file\n")
8027					os.Exit(1)
8028				}
8029
8030				xmlq := eutils.CreateXMLProducer(topPattern, star, trdr)
8031				tblq := createConsumers(cmds, parent, hd, tl, transform, histogram, xmlq)
8032
8033				if xmlq == nil || tblq == nil {
8034					fmt.Fprintf(os.Stderr, "\nERROR: Unable to create servers\n")
8035					os.Exit(1)
8036				}
8037
8038				begTime := time.Now()
8039				recordCount = 0
8040
8041				for range tblq {
8042					recordCount++
8043					runtime.Gosched()
8044				}
8045
8046				inFile.Close()
8047
8048				debug.FreeOSMemory()
8049
8050				endTime := time.Now()
8051				expended := endTime.Sub(begTime)
8052				secs := float64(expended.Nanoseconds()) / 1e9
8053
8054				if secs >= 0.000001 && recordCount > 0 {
8055					speed := int(float64(recordCount) / secs)
8056					sum += speed
8057					count++
8058					x := float64(speed)
8059					delta := x - mean
8060					mean += delta / float64(count)
8061					m2 += delta * (x - mean)
8062				}
8063			}
8064
8065			if legend != "" {
8066				fmt.Printf("%s\n", legend)
8067				legend = ""
8068			}
8069			if count > 1 {
8070				vrc := m2 / float64(count-1)
8071				dev := int(math.Sqrt(vrc))
8072				fmt.Printf("%d\t%d\t%d\n", numServ, sum/count, dev)
8073			}
8074		}
8075
8076		return
8077	}
8078
8079	// PROCESS SINGLE SELECTED RECORD IF -pattern ARGUMENT IS IMMEDIATELY FOLLOWED BY -position COMMAND
8080
8081	posn := ""
8082	if cmds.Visit == topPat {
8083		if cmds.Position == "outer" ||
8084			cmds.Position == "inner" ||
8085			cmds.Position == "even" ||
8086			cmds.Position == "odd" ||
8087			cmds.Position == "all" {
8088			// filter by record position when draining unshuffler channel
8089			posn = cmds.Position
8090			cmds.Position = ""
8091		}
8092	}
8093
8094	if cmds.Visit == topPat && cmds.Position != "" && cmds.Position != "select" {
8095
8096		qry := ""
8097		idx := 0
8098		rec := 0
8099
8100		if cmds.Position == "first" {
8101
8102			eutils.PartitionPattern(topPattern, star, rdr,
8103				func(str string) {
8104					rec++
8105					if rec == 1 {
8106						qry = str
8107						idx = rec
8108					}
8109				})
8110
8111		} else if cmds.Position == "last" {
8112
8113			eutils.PartitionPattern(topPattern, star, rdr,
8114				func(str string) {
8115					qry = str
8116					idx = rec
8117				})
8118
8119		} else {
8120
8121			// use numeric position
8122			number, err := strconv.Atoi(cmds.Position)
8123			if err != nil {
8124				fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized position '%s'\n", cmds.Position)
8125				os.Exit(1)
8126			}
8127
8128			eutils.PartitionPattern(topPattern, star, rdr,
8129				func(str string) {
8130					rec++
8131					if rec == number {
8132						qry = str
8133						idx = rec
8134					}
8135				})
8136		}
8137
8138		if qry == "" {
8139			return
8140		}
8141
8142		// clear position on top node to prevent condition test failure
8143		cmds.Position = ""
8144
8145		// process single selected record
8146		res := processQuery(qry[:], parent, idx, hd, tl, transform, histogram, cmds)
8147
8148		if res != "" {
8149			fmt.Printf("%s", res)
8150		}
8151
8152		return
8153	}
8154
8155	// LAUNCH PRODUCER, CONSUMER, AND UNSHUFFLER GOROUTINES
8156
8157	// launch producer goroutine to partition XML by pattern
8158	xmlq := eutils.CreateXMLProducer(topPattern, star, rdr)
8159
8160	// launch consumer goroutines to parse and explore partitioned XML objects
8161	tblq := createConsumers(cmds, parent, hd, tl, transform, histogram, xmlq)
8162
8163	// launch unshuffler goroutine to restore order of results
8164	unsq := eutils.CreateXMLUnshuffler(tblq)
8165
8166	if xmlq == nil || tblq == nil || unsq == nil {
8167		fmt.Fprintf(os.Stderr, "\nERROR: Unable to create servers\n")
8168		os.Exit(1)
8169	}
8170
8171	// PERFORMANCE SUMMARY
8172
8173	/*
8174		if dbug {
8175
8176			// drain results, but suppress extraction output
8177			for ext := range unsq {
8178				byteCount += len(ext.Text)
8179				recordCount++
8180				runtime.Gosched()
8181			}
8182
8183			// force garbage collection, return memory to operating system
8184			debug.FreeOSMemory()
8185
8186			// print processing parameters as XML object
8187			stopTime := time.Now()
8188			duration := stopTime.Sub(StartTime)
8189			seconds := float64(duration.Nanoseconds()) / 1e9
8190
8191			// Threads is a more easily explained concept than GOMAXPROCS
8192			fmt.Printf("<Xtract>\n")
8193			fmt.Printf("  <Threads>%d</Threads>\n", numProcs)
8194			fmt.Printf("  <Parsers>%d</Parsers>\n", NumServe)
8195			fmt.Printf("  <Time>%.3f</Time>\n", seconds)
8196			if seconds >= 0.001 && recordCount > 0 {
8197				rate := int(float64(recordCount) / seconds)
8198				fmt.Printf("  <Rate>%d</Rate>\n", rate)
8199			}
8200			fmt.Printf("</Xtract>\n")
8201
8202			return
8203		}
8204	*/
8205
8206	// DRAIN OUTPUT CHANNEL TO EXECUTE EXTRACTION COMMANDS, RESTORE OUTPUT ORDER WITH HEAP
8207
8208	var buffer strings.Builder
8209	count := 0
8210	okay := false
8211
8212	wrtr := bufio.NewWriter(os.Stdout)
8213
8214	// printResult prints output for current pattern, handles -empty and -ident flags, and periodically flushes buffer
8215	printResult := func(curr eutils.XMLRecord) {
8216
8217		str := curr.Text
8218
8219		if mpty {
8220
8221			if str == "" {
8222
8223				okay = true
8224
8225				idx := curr.Index
8226				val := strconv.Itoa(idx)
8227				buffer.WriteString(val[:])
8228				buffer.WriteString("\n")
8229
8230				count++
8231			}
8232
8233		} else if str != "" {
8234
8235			okay = true
8236
8237			if idnt {
8238				idx := curr.Index
8239				val := strconv.Itoa(idx)
8240				buffer.WriteString(val[:])
8241				buffer.WriteString("\t")
8242			}
8243
8244			// save output to byte buffer
8245			buffer.WriteString(str[:])
8246
8247			count++
8248		}
8249
8250		if count > 1000 {
8251			count = 0
8252			txt := buffer.String()
8253			if txt != "" {
8254				// print current buffer
8255				wrtr.WriteString(txt[:])
8256			}
8257			buffer.Reset()
8258		}
8259	}
8260
8261	if head != "" {
8262		buffer.WriteString(head[:])
8263		buffer.WriteString("\n")
8264	}
8265
8266	// drain unshuffler channel
8267
8268	if posn == "outer" {
8269
8270		// print only first and last records
8271		var beg *eutils.XMLRecord
8272		var end *eutils.XMLRecord
8273
8274		for curr := range unsq {
8275
8276			if beg == nil {
8277				beg = &eutils.XMLRecord{Index: curr.Index, Ident: curr.Ident, Text: curr.Text}
8278			} else {
8279				end = &eutils.XMLRecord{Index: curr.Index, Ident: curr.Ident, Text: curr.Text}
8280			}
8281
8282			recordCount++
8283		}
8284
8285		if beg != nil {
8286			printResult(*beg)
8287		}
8288		if end != nil {
8289			printResult(*end)
8290		}
8291
8292	} else if posn == "inner" {
8293
8294		// print all but first and last records
8295		var prev *eutils.XMLRecord
8296		var next *eutils.XMLRecord
8297		first := true
8298
8299		for curr := range unsq {
8300
8301			if first {
8302				first = false
8303			} else {
8304				prev = next
8305				next = &eutils.XMLRecord{Index: curr.Index, Ident: curr.Ident, Text: curr.Text}
8306			}
8307
8308			if prev != nil {
8309				printResult(*prev)
8310			}
8311
8312			recordCount++
8313		}
8314
8315	} else if posn == "even" {
8316
8317		even := false
8318
8319		for curr := range unsq {
8320
8321			if even {
8322				printResult(curr)
8323			}
8324			even = !even
8325
8326			recordCount++
8327		}
8328
8329	} else if posn == "odd" {
8330
8331		odd := true
8332
8333		for curr := range unsq {
8334
8335			if odd {
8336				printResult(curr)
8337			}
8338			odd = !odd
8339
8340			recordCount++
8341		}
8342
8343	} else {
8344
8345		// default or -position all
8346		for curr := range unsq {
8347
8348			// send result to output
8349			printResult(curr)
8350
8351			recordCount++
8352			runtime.Gosched()
8353		}
8354	}
8355
8356	if tail != "" {
8357		buffer.WriteString(tail[:])
8358		buffer.WriteString("\n")
8359	}
8360
8361	// do not print head or tail if no extraction output
8362	if okay {
8363		txt := buffer.String()
8364		if txt != "" {
8365			// print final buffer
8366			wrtr.WriteString(txt[:])
8367		}
8368	}
8369	buffer.Reset()
8370
8371	wrtr.Flush()
8372
8373	// print -histogram results, if populated
8374	var keys []string
8375	for ky := range histogram {
8376		keys = append(keys, ky)
8377	}
8378	if len(keys) > 0 {
8379		// sort fields in alphabetical or numeric order
8380		sort.Slice(keys, func(i, j int) bool {
8381			// numeric sort on strings checks lengths first
8382			if eutils.IsAllDigits(keys[i]) && eutils.IsAllDigits(keys[j]) {
8383				lni := len(keys[i])
8384				lnj := len(keys[j])
8385				// shorter string is numerically less, assuming no leading zeros
8386				if lni < lnj {
8387					return true
8388				}
8389				if lni > lnj {
8390					return false
8391				}
8392			}
8393			// same length or non-numeric, can now do string comparison on contents
8394			return keys[i] < keys[j]
8395		})
8396
8397		for _, str := range keys {
8398
8399			count := histogram[str]
8400			val := strconv.Itoa(count)
8401			os.Stdout.WriteString(val)
8402			os.Stdout.WriteString("\t")
8403			os.Stdout.WriteString(str)
8404			os.Stdout.WriteString("\n")
8405		}
8406	}
8407
8408	// force garbage collection and return memory before calculating processing rate
8409	debug.FreeOSMemory()
8410
8411	if timr {
8412		printDuration("records")
8413	}
8414}
8415