1// ===========================================================================
2//
3//                            PUBLIC DOMAIN NOTICE
4//            National Center for Biotechnology Information (NCBI)
5//
6//  This software/database is a "United States Government Work" under the
7//  terms of the United States Copyright Act. It was written as part of
8//  the author's official duties as a United States Government employee and
9//  thus cannot be copyrighted. This software/database is freely available
10//  to the public for use. The National Library of Medicine and the U.S.
11//  Government do not place any restriction on its use or reproduction.
12//  We would, however, appreciate having the NCBI and the author cited in
13//  any work or product based on this material.
14//
15//  Although all reasonable efforts have been taken to ensure the accuracy
16//  and reliability of the software and data, the NLM and the U.S.
17//  Government do not and cannot warrant the performance or results that
18//  may be obtained by using this software or data. The NLM and the U.S.
19//  Government disclaim all warranties, express or implied, including
20//  warranties of performance, merchantability or fitness for any particular
21//  purpose.
22//
23// ===========================================================================
24//
25// File Name:  hgvs.go
26//
27// Author:  Jonathan Kans
28//
29// ==========================================================================
30
31package eutils
32
33import (
34	"fmt"
35	"os"
36	"regexp"
37	"sort"
38	"strconv"
39	"strings"
40)
41
42var (
43	accnRegEx *regexp.Regexp
44	genRegEx  *regexp.Regexp
45	protRegEx *regexp.Regexp
46)
47
48// integer table for HGVS class used to define sort order
49const (
50	_ = iota
51	GENOMIC
52	CODING
53	NONCODING
54	MITOCONDRIAL
55	RNA
56	PROTEIN
57)
58
59var hgvsClass = map[int]string{
60	GENOMIC:      "Genomic",
61	CODING:       "Coding",
62	NONCODING:    "Noncoding",
63	MITOCONDRIAL: "Mitochondrial",
64	RNA:          "RNA",
65	PROTEIN:      "Protein",
66}
67
68// integer table for HGVS type used to define sort order
69const (
70	_ = iota
71	SUBS
72	MIS
73	TRM
74	EXT
75	SYN
76	DEL
77	INV
78	DUP
79	INS
80	CONV
81	INDEL
82	FS
83	TRANS
84	REP
85	MULT
86)
87
88var hgvsType = map[int]string{
89	SUBS:  "Substitution",
90	MIS:   "Missense",
91	TRM:   "Termination",
92	EXT:   "Extension",
93	SYN:   "Synonymous",
94	DEL:   "Deletion",
95	INV:   "Inversion",
96	DUP:   "Duplication",
97	INS:   "Insertion",
98	CONV:  "Conversion",
99	INDEL: "Indel",
100	FS:    "Frameshift",
101	TRANS: "Translocation",
102	REP:   "Repetitive",
103	MULT:  "Multiple",
104}
105
106// SPDI accession subfields used for controlling sort order
107type SPDI struct {
108	Class     int
109	Type      int
110	Accession string
111	Prefix    string
112	Digits    string
113	Number    int
114	Version   int
115	Position  int
116	Deleted   string
117	Inserted  string
118}
119
120var naConvert = map[string]string{
121	"-": "-",
122	"a": "A",
123	"c": "C",
124	"g": "G",
125	"t": "T",
126	"u": "T",
127}
128
129var aaConvert = map[string]string{
130	"-":   "-",
131	"*":   "*",
132	"a":   "A",
133	"b":   "B",
134	"c":   "C",
135	"d":   "D",
136	"e":   "E",
137	"f":   "F",
138	"g":   "G",
139	"h":   "H",
140	"i":   "I",
141	"j":   "J",
142	"k":   "K",
143	"l":   "L",
144	"m":   "M",
145	"n":   "N",
146	"o":   "O",
147	"p":   "P",
148	"q":   "Q",
149	"r":   "R",
150	"s":   "S",
151	"t":   "T",
152	"u":   "U",
153	"v":   "V",
154	"w":   "W",
155	"x":   "X",
156	"y":   "Y",
157	"z":   "Z",
158	"ala": "A",
159	"arg": "R",
160	"asn": "N",
161	"asp": "D",
162	"asx": "B",
163	"cys": "C",
164	"gap": "-",
165	"gln": "Q",
166	"glu": "E",
167	"glx": "Z",
168	"gly": "G",
169	"his": "H",
170	"ile": "I",
171	"leu": "L",
172	"lys": "K",
173	"met": "M",
174	"phe": "F",
175	"pro": "P",
176	"pyl": "O",
177	"sec": "U",
178	"ser": "S",
179	"stp": "*",
180	"ter": "*",
181	"thr": "T",
182	"trp": "W",
183	"tyr": "Y",
184	"val": "V",
185	"xle": "J",
186	"xxx": "X",
187}
188
189// ParseHGVS parses sequence variant format into XML
190func ParseHGVS(str string) string {
191
192	// initialize variation description regular expressions
193	if accnRegEx == nil {
194		accnRegEx = regexp.MustCompile("\\*|\\d+|\\.|\\D+")
195	}
196	if genRegEx == nil {
197		genRegEx = regexp.MustCompile("\\d+|\\D|>|\\D")
198	}
199	if protRegEx == nil {
200		protRegEx = regexp.MustCompile("\\*|\\d+|\\D+")
201	}
202
203	// track highest version per accession
204	highest := make(map[string]int)
205
206	hasNM := false
207	hasNP := false
208
209	// parse substitution
210	parseSubs := func(cls, acc, vrn string) *SPDI {
211
212		pos := ""
213		ins := ""
214		del := ""
215		class := 0
216		htype := 0
217		ok := false
218
219		// parse variation string
220		switch cls {
221		case "g":
222			class = GENOMIC
223
224			arry := genRegEx.FindAllString(vrn, -1)
225			if len(arry) != 4 {
226				// fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized variant '%s', array '%v'\n", vrn, arry)
227				return nil
228			}
229
230			pos = arry[0]
231
232			lf := strings.ToLower(arry[1])
233			rt := strings.ToLower(arry[3])
234
235			del, ok = naConvert[lf]
236			if !ok {
237				fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized nucleotide '%s'\n", lf)
238				return nil
239			}
240			ins, ok = naConvert[rt]
241			if !ok {
242				fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized nucleotide '%s'\n", rt)
243				return nil
244			}
245
246			htype = SUBS
247		case "c":
248			class = CODING
249
250			arry := genRegEx.FindAllString(vrn, -1)
251			if len(arry) != 4 {
252				// fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized variant '%s', array '%v'\n", vrn, arry)
253				return nil
254			}
255
256			// pos starts at the first nucleotide of the translation initiation codon
257			pos = arry[0]
258
259			lf := strings.ToLower(arry[1])
260			rt := strings.ToLower(arry[3])
261
262			del, ok = naConvert[lf]
263			if !ok {
264				fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized nucleotide '%s'\n", lf)
265				return nil
266			}
267			ins, ok = naConvert[rt]
268			if !ok {
269				fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized nucleotide '%s'\n", rt)
270				return nil
271			}
272
273			htype = SUBS
274		case "p":
275			class = PROTEIN
276
277			arry := protRegEx.FindAllString(vrn, -1)
278			if len(arry) != 3 {
279				// fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized variant '%s', array '%v'\n", vrn, arry)
280				return nil
281			}
282
283			pos = arry[1]
284
285			lf := strings.ToLower(arry[0])
286			rt := strings.ToLower(arry[2])
287
288			del, ok = aaConvert[lf]
289			if !ok {
290				fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized amino acid '%s'\n", lf)
291				return nil
292			}
293			ins, ok = aaConvert[rt]
294			if !ok {
295				fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized amino acid '%s'\n", rt)
296				return nil
297			}
298
299			htype = MIS
300			if ins == "*" {
301				htype = TRM
302			} else if del == "*" {
303				htype = EXT
304			} else if del == ins {
305				htype = SYN
306			}
307		default:
308			// fmt.Fprintf(os.Stderr, "\nWARNING: Parsing of HGVS class '%s' not yet implemented\n", cls)
309			return nil
310		}
311
312		// position should be unsigned integer
313		if !IsAllDigits(pos) {
314			fmt.Fprintf(os.Stderr, "\nERROR: Non-integer position '%s'\n", pos)
315			return nil
316		}
317
318		num, err := strconv.Atoi(pos)
319		if err != nil {
320			fmt.Fprintf(os.Stderr, "\nERROR: Integer conversion error '%s'\n", err)
321			return nil
322		}
323		// adjust position to 0-based
324		num--
325
326		spdi := &SPDI{Class: class, Type: htype, Accession: acc, Position: num, Deleted: del, Inserted: ins}
327
328		return spdi
329	}
330
331	// reality checks on variant type, will eventually call the different type-specific parsers
332	parseOneType := func(cls, acc, vrn string) *SPDI {
333
334		if strings.Index(vrn, "delins") >= 0 || strings.Index(vrn, "indel") >= 0 {
335			// fmt.Fprintf(os.Stderr, "\nWARNING: deletion-insertion not yet implemented\n")
336			return nil
337		}
338		if strings.Index(vrn, "del") >= 0 {
339			// fmt.Fprintf(os.Stderr, "\nWARNING: deletion not yet implemented\n")
340			return nil
341		}
342		if strings.Index(vrn, "inv") >= 0 {
343			// fmt.Fprintf(os.Stderr, "\nWARNING: inversion not yet implemented\n")
344			return nil
345		}
346		if strings.Index(vrn, "dup") >= 0 {
347			// fmt.Fprintf(os.Stderr, "\nWARNING: duplication not yet implemented\n")
348			return nil
349		}
350		if strings.Index(vrn, "con") >= 0 {
351			// fmt.Fprintf(os.Stderr, "\nWARNING: conversion not yet implemented\n")
352			return nil
353		}
354		if strings.Index(vrn, "ins") >= 0 {
355			// fmt.Fprintf(os.Stderr, "\nWARNING: insertion not yet implemented\n")
356			return nil
357		}
358		if strings.Index(vrn, "fs") >= 0 {
359			// fmt.Fprintf(os.Stderr, "\nWARNING: frameshift not yet implemented\n")
360			return nil
361		}
362		if strings.Index(vrn, "*") >= 0 {
363			// fmt.Fprintf(os.Stderr, "\nWARNING: UTR variant not yet implemented\n")
364			return nil
365		}
366		if strings.Index(vrn, "?") >= 0 {
367			// fmt.Fprintf(os.Stderr, "\nWARNING: predictions not yet implemented\n")
368			return nil
369		}
370		if strings.Index(vrn, ";") >= 0 {
371			// fmt.Fprintf(os.Stderr, "\nWARNING: variation pairs not yet implemented\n")
372			return nil
373		}
374		if strings.Index(vrn, "/") >= 0 {
375			// fmt.Fprintf(os.Stderr, "\nWARNING: variation pairs not yet implemented\n")
376			return nil
377		}
378		if strings.Index(vrn, "(") >= 0 || strings.Index(vrn, ")") >= 0 {
379			// fmt.Fprintf(os.Stderr, "\nWARNING: intron variation not yet implemented\n")
380			return nil
381		}
382		if strings.Index(vrn, "[") >= 0 || strings.Index(vrn, "]") >= 0 || strings.Index(vrn, ";") >= 0 {
383			// fmt.Fprintf(os.Stderr, "\nWARNING: repetitive stretch not yet implemented\n")
384			return nil
385		}
386
387		// otherwise use substitution
388		return parseSubs(cls, acc, vrn)
389	}
390
391	var buffer strings.Builder
392
393	var spdis []*SPDI
394
395	ok := false
396
397	// trim prefix and suffix
398	str = strings.TrimPrefix(str, "HGVS=")
399	idx := strings.Index(str, "|")
400	if idx >= 0 {
401		str = str[:idx]
402	}
403
404	// separate at commas
405	htgs := strings.Split(str, ",")
406
407	for _, htg := range htgs {
408
409		// skip empty item
410		if htg == "" {
411			continue
412		}
413
414		// extract accession
415		acc, rgt := SplitInTwoLeft(htg, ":")
416		if acc == "" || rgt == "" {
417			// fmt.Fprintf(os.Stderr, "\nERROR: Unable to parse HGVS '%s'\n", htg)
418			continue
419		}
420		// split into type and variation
421		cls, vrn := SplitInTwoLeft(rgt, ".")
422		if cls == "" || vrn == "" {
423			// fmt.Fprintf(os.Stderr, "\nERROR: Unable to parse HGVS '%s'\n", rgt)
424			continue
425		}
426
427		// normalize variant string
428		vrn = strings.TrimSpace(vrn)
429		vrn = strings.ToLower(vrn)
430
431		// predicted protein change enclosed in parentheses
432		vrn = strings.TrimPrefix(vrn, "(")
433		vrn = strings.TrimSuffix(vrn, ")")
434
435		spdi := parseOneType(cls, acc, vrn)
436		if spdi == nil {
437			continue
438		}
439
440		// get accession fields for sorting
441		pfx := ""
442		num := ""
443		ver := "0"
444
445		// use regular expression to handle XX_ and XX_ABCD accessions
446		arry := accnRegEx.FindAllString(acc, -1)
447		if len(arry) == 2 {
448			pfx = strings.ToUpper(arry[0])
449			num = strings.ToLower(arry[1])
450		} else if len(arry) == 4 {
451			pfx = strings.ToUpper(arry[0])
452			num = strings.ToLower(arry[1])
453			ver = strings.ToLower(arry[3])
454			if arry[2] != "." {
455				fmt.Fprintf(os.Stderr, "\nERROR: Unable to parse version '%s', arry '%v'\n", acc, arry)
456				continue
457			}
458		} else {
459			fmt.Fprintf(os.Stderr, "\nERROR: Unable to parse accession '%s', arry '%v'\n", acc, arry)
460			continue
461		}
462
463		if pfx == "" || num == "" {
464			fmt.Fprintf(os.Stderr, "\nERROR: Unable to parse accession '%s'\n", acc)
465			continue
466		}
467
468		// RefSeq accession body should be unsigned integer
469		if !IsAllDigits(num) {
470			fmt.Fprintf(os.Stderr, "\nERROR: Non-integer accession body '%s'\n", num)
471			continue
472		}
473
474		val, err := strconv.Atoi(num)
475		if err != nil {
476			fmt.Fprintf(os.Stderr, "\nERROR: Integer conversion error '%s'\n", err)
477			continue
478		}
479
480		vsn, err := strconv.Atoi(ver)
481		if err != nil {
482			fmt.Fprintf(os.Stderr, "\nERROR: Version error '%s'\n", err)
483			continue
484		}
485
486		spdi.Prefix = pfx
487		spdi.Digits = num
488		spdi.Number = val
489		spdi.Version = vsn
490
491		// record highest version for each accession
492		unver := pfx + num
493		vr, found := highest[unver]
494		if !found {
495			highest[unver] = vsn
496		} else if vr > vsn {
497			highest[unver] = vr
498		}
499
500		if strings.HasPrefix(pfx, "NM_") {
501			hasNM = true
502		}
503		if strings.HasPrefix(pfx, "NP_") {
504			hasNP = true
505		}
506
507		spdis = append(spdis, spdi)
508		ok = true
509	}
510
511	if !ok {
512		return ""
513	}
514
515	sort.Slice(spdis, func(i, j int) bool {
516		a := spdis[i]
517		b := spdis[j]
518		if a.Class != b.Class {
519			return a.Class < b.Class
520		}
521		if a.Type != b.Type {
522			return a.Type < b.Type
523		}
524		if a.Prefix != b.Prefix {
525			return a.Prefix < b.Prefix
526		}
527		// numeric comparison of accession digits ignores leading zeros
528		if a.Number != b.Number {
529			return a.Number < b.Number
530		}
531		// most recent version goes first
532		if a.Version != b.Version {
533			return a.Version > b.Version
534		}
535		if a.Position != b.Position {
536			return a.Position < b.Position
537		}
538		if a.Deleted != b.Deleted {
539			return a.Deleted < b.Deleted
540		}
541		if a.Inserted != b.Inserted {
542			return a.Inserted < b.Inserted
543		}
544		return false
545	})
546
547	lastClass := ""
548
549	for _, itm := range spdis {
550
551		// skip earlier accession versions
552		unver := itm.Prefix + itm.Digits
553		vr, found := highest[unver]
554		if found && vr > itm.Version {
555			continue
556		}
557
558		// skip XM_ and XP_ if NM_ or NP_, respectively, are already present
559		if hasNM && strings.HasPrefix(itm.Accession, "XM_") {
560			continue
561		}
562		if hasNP && strings.HasPrefix(itm.Accession, "XP_") {
563			continue
564		}
565
566		clss := hgvsClass[itm.Class]
567		htyp := hgvsType[itm.Type]
568
569		if lastClass != clss {
570			if lastClass != "" {
571				buffer.WriteString("</" + lastClass + ">\n")
572			}
573			lastClass = clss
574			buffer.WriteString("<" + lastClass + ">\n")
575		}
576
577		// buffer.WriteString("<" + htyp + ">\n")
578
579		pos := strconv.Itoa(itm.Position)
580		lbl := "Position"
581		if itm.Class == CODING {
582			lbl = "Offset"
583		}
584		buffer.WriteString("<Variant>" +
585			"<Type>" + htyp + "</Type>" +
586			"<Accession>" + itm.Accession + "</Accession>" +
587			"<" + lbl + ">" + pos + "</" + lbl + ">" +
588			"<Deleted>" + itm.Deleted + "</Deleted>" +
589			"<Inserted>" + itm.Inserted + "</Inserted>" +
590			"</Variant>\n")
591
592		// buffer.WriteString("</" + htyp + ">\n")
593	}
594
595	if lastClass != "" {
596		buffer.WriteString("</" + lastClass + ">\n")
597	}
598
599	return buffer.String()
600}
601