1// =========================================================================== 2// 3// PUBLIC DOMAIN NOTICE 4// National Center for Biotechnology Information (NCBI) 5// 6// This software/database is a "United States Government Work" under the 7// terms of the United States Copyright Act. It was written as part of 8// the author's official duties as a United States Government employee and 9// thus cannot be copyrighted. This software/database is freely available 10// to the public for use. The National Library of Medicine and the U.S. 11// Government do not place any restriction on its use or reproduction. 12// We would, however, appreciate having the NCBI and the author cited in 13// any work or product based on this material. 14// 15// Although all reasonable efforts have been taken to ensure the accuracy 16// and reliability of the software and data, the NLM and the U.S. 17// Government do not and cannot warrant the performance or results that 18// may be obtained by using this software or data. The NLM and the U.S. 19// Government disclaim all warranties, express or implied, including 20// warranties of performance, merchantability or fitness for any particular 21// purpose. 22// 23// =========================================================================== 24// 25// File Name: hgvs.go 26// 27// Author: Jonathan Kans 28// 29// ========================================================================== 30 31package eutils 32 33import ( 34 "fmt" 35 "os" 36 "regexp" 37 "sort" 38 "strconv" 39 "strings" 40) 41 42var ( 43 accnRegEx *regexp.Regexp 44 genRegEx *regexp.Regexp 45 protRegEx *regexp.Regexp 46) 47 48// integer table for HGVS class used to define sort order 49const ( 50 _ = iota 51 GENOMIC 52 CODING 53 NONCODING 54 MITOCONDRIAL 55 RNA 56 PROTEIN 57) 58 59var hgvsClass = map[int]string{ 60 GENOMIC: "Genomic", 61 CODING: "Coding", 62 NONCODING: "Noncoding", 63 MITOCONDRIAL: "Mitochondrial", 64 RNA: "RNA", 65 PROTEIN: "Protein", 66} 67 68// integer table for HGVS type used to define sort order 69const ( 70 _ = iota 71 SUBS 72 MIS 73 TRM 74 EXT 75 SYN 76 DEL 77 INV 78 DUP 79 INS 80 CONV 81 INDEL 82 FS 83 TRANS 84 REP 85 MULT 86) 87 88var hgvsType = map[int]string{ 89 SUBS: "Substitution", 90 MIS: "Missense", 91 TRM: "Termination", 92 EXT: "Extension", 93 SYN: "Synonymous", 94 DEL: "Deletion", 95 INV: "Inversion", 96 DUP: "Duplication", 97 INS: "Insertion", 98 CONV: "Conversion", 99 INDEL: "Indel", 100 FS: "Frameshift", 101 TRANS: "Translocation", 102 REP: "Repetitive", 103 MULT: "Multiple", 104} 105 106// SPDI accession subfields used for controlling sort order 107type SPDI struct { 108 Class int 109 Type int 110 Accession string 111 Prefix string 112 Digits string 113 Number int 114 Version int 115 Position int 116 Deleted string 117 Inserted string 118} 119 120var naConvert = map[string]string{ 121 "-": "-", 122 "a": "A", 123 "c": "C", 124 "g": "G", 125 "t": "T", 126 "u": "T", 127} 128 129var aaConvert = map[string]string{ 130 "-": "-", 131 "*": "*", 132 "a": "A", 133 "b": "B", 134 "c": "C", 135 "d": "D", 136 "e": "E", 137 "f": "F", 138 "g": "G", 139 "h": "H", 140 "i": "I", 141 "j": "J", 142 "k": "K", 143 "l": "L", 144 "m": "M", 145 "n": "N", 146 "o": "O", 147 "p": "P", 148 "q": "Q", 149 "r": "R", 150 "s": "S", 151 "t": "T", 152 "u": "U", 153 "v": "V", 154 "w": "W", 155 "x": "X", 156 "y": "Y", 157 "z": "Z", 158 "ala": "A", 159 "arg": "R", 160 "asn": "N", 161 "asp": "D", 162 "asx": "B", 163 "cys": "C", 164 "gap": "-", 165 "gln": "Q", 166 "glu": "E", 167 "glx": "Z", 168 "gly": "G", 169 "his": "H", 170 "ile": "I", 171 "leu": "L", 172 "lys": "K", 173 "met": "M", 174 "phe": "F", 175 "pro": "P", 176 "pyl": "O", 177 "sec": "U", 178 "ser": "S", 179 "stp": "*", 180 "ter": "*", 181 "thr": "T", 182 "trp": "W", 183 "tyr": "Y", 184 "val": "V", 185 "xle": "J", 186 "xxx": "X", 187} 188 189// ParseHGVS parses sequence variant format into XML 190func ParseHGVS(str string) string { 191 192 // initialize variation description regular expressions 193 if accnRegEx == nil { 194 accnRegEx = regexp.MustCompile("\\*|\\d+|\\.|\\D+") 195 } 196 if genRegEx == nil { 197 genRegEx = regexp.MustCompile("\\d+|\\D|>|\\D") 198 } 199 if protRegEx == nil { 200 protRegEx = regexp.MustCompile("\\*|\\d+|\\D+") 201 } 202 203 // track highest version per accession 204 highest := make(map[string]int) 205 206 hasNM := false 207 hasNP := false 208 209 // parse substitution 210 parseSubs := func(cls, acc, vrn string) *SPDI { 211 212 pos := "" 213 ins := "" 214 del := "" 215 class := 0 216 htype := 0 217 ok := false 218 219 // parse variation string 220 switch cls { 221 case "g": 222 class = GENOMIC 223 224 arry := genRegEx.FindAllString(vrn, -1) 225 if len(arry) != 4 { 226 // fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized variant '%s', array '%v'\n", vrn, arry) 227 return nil 228 } 229 230 pos = arry[0] 231 232 lf := strings.ToLower(arry[1]) 233 rt := strings.ToLower(arry[3]) 234 235 del, ok = naConvert[lf] 236 if !ok { 237 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized nucleotide '%s'\n", lf) 238 return nil 239 } 240 ins, ok = naConvert[rt] 241 if !ok { 242 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized nucleotide '%s'\n", rt) 243 return nil 244 } 245 246 htype = SUBS 247 case "c": 248 class = CODING 249 250 arry := genRegEx.FindAllString(vrn, -1) 251 if len(arry) != 4 { 252 // fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized variant '%s', array '%v'\n", vrn, arry) 253 return nil 254 } 255 256 // pos starts at the first nucleotide of the translation initiation codon 257 pos = arry[0] 258 259 lf := strings.ToLower(arry[1]) 260 rt := strings.ToLower(arry[3]) 261 262 del, ok = naConvert[lf] 263 if !ok { 264 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized nucleotide '%s'\n", lf) 265 return nil 266 } 267 ins, ok = naConvert[rt] 268 if !ok { 269 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized nucleotide '%s'\n", rt) 270 return nil 271 } 272 273 htype = SUBS 274 case "p": 275 class = PROTEIN 276 277 arry := protRegEx.FindAllString(vrn, -1) 278 if len(arry) != 3 { 279 // fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized variant '%s', array '%v'\n", vrn, arry) 280 return nil 281 } 282 283 pos = arry[1] 284 285 lf := strings.ToLower(arry[0]) 286 rt := strings.ToLower(arry[2]) 287 288 del, ok = aaConvert[lf] 289 if !ok { 290 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized amino acid '%s'\n", lf) 291 return nil 292 } 293 ins, ok = aaConvert[rt] 294 if !ok { 295 fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized amino acid '%s'\n", rt) 296 return nil 297 } 298 299 htype = MIS 300 if ins == "*" { 301 htype = TRM 302 } else if del == "*" { 303 htype = EXT 304 } else if del == ins { 305 htype = SYN 306 } 307 default: 308 // fmt.Fprintf(os.Stderr, "\nWARNING: Parsing of HGVS class '%s' not yet implemented\n", cls) 309 return nil 310 } 311 312 // position should be unsigned integer 313 if !IsAllDigits(pos) { 314 fmt.Fprintf(os.Stderr, "\nERROR: Non-integer position '%s'\n", pos) 315 return nil 316 } 317 318 num, err := strconv.Atoi(pos) 319 if err != nil { 320 fmt.Fprintf(os.Stderr, "\nERROR: Integer conversion error '%s'\n", err) 321 return nil 322 } 323 // adjust position to 0-based 324 num-- 325 326 spdi := &SPDI{Class: class, Type: htype, Accession: acc, Position: num, Deleted: del, Inserted: ins} 327 328 return spdi 329 } 330 331 // reality checks on variant type, will eventually call the different type-specific parsers 332 parseOneType := func(cls, acc, vrn string) *SPDI { 333 334 if strings.Index(vrn, "delins") >= 0 || strings.Index(vrn, "indel") >= 0 { 335 // fmt.Fprintf(os.Stderr, "\nWARNING: deletion-insertion not yet implemented\n") 336 return nil 337 } 338 if strings.Index(vrn, "del") >= 0 { 339 // fmt.Fprintf(os.Stderr, "\nWARNING: deletion not yet implemented\n") 340 return nil 341 } 342 if strings.Index(vrn, "inv") >= 0 { 343 // fmt.Fprintf(os.Stderr, "\nWARNING: inversion not yet implemented\n") 344 return nil 345 } 346 if strings.Index(vrn, "dup") >= 0 { 347 // fmt.Fprintf(os.Stderr, "\nWARNING: duplication not yet implemented\n") 348 return nil 349 } 350 if strings.Index(vrn, "con") >= 0 { 351 // fmt.Fprintf(os.Stderr, "\nWARNING: conversion not yet implemented\n") 352 return nil 353 } 354 if strings.Index(vrn, "ins") >= 0 { 355 // fmt.Fprintf(os.Stderr, "\nWARNING: insertion not yet implemented\n") 356 return nil 357 } 358 if strings.Index(vrn, "fs") >= 0 { 359 // fmt.Fprintf(os.Stderr, "\nWARNING: frameshift not yet implemented\n") 360 return nil 361 } 362 if strings.Index(vrn, "*") >= 0 { 363 // fmt.Fprintf(os.Stderr, "\nWARNING: UTR variant not yet implemented\n") 364 return nil 365 } 366 if strings.Index(vrn, "?") >= 0 { 367 // fmt.Fprintf(os.Stderr, "\nWARNING: predictions not yet implemented\n") 368 return nil 369 } 370 if strings.Index(vrn, ";") >= 0 { 371 // fmt.Fprintf(os.Stderr, "\nWARNING: variation pairs not yet implemented\n") 372 return nil 373 } 374 if strings.Index(vrn, "/") >= 0 { 375 // fmt.Fprintf(os.Stderr, "\nWARNING: variation pairs not yet implemented\n") 376 return nil 377 } 378 if strings.Index(vrn, "(") >= 0 || strings.Index(vrn, ")") >= 0 { 379 // fmt.Fprintf(os.Stderr, "\nWARNING: intron variation not yet implemented\n") 380 return nil 381 } 382 if strings.Index(vrn, "[") >= 0 || strings.Index(vrn, "]") >= 0 || strings.Index(vrn, ";") >= 0 { 383 // fmt.Fprintf(os.Stderr, "\nWARNING: repetitive stretch not yet implemented\n") 384 return nil 385 } 386 387 // otherwise use substitution 388 return parseSubs(cls, acc, vrn) 389 } 390 391 var buffer strings.Builder 392 393 var spdis []*SPDI 394 395 ok := false 396 397 // trim prefix and suffix 398 str = strings.TrimPrefix(str, "HGVS=") 399 idx := strings.Index(str, "|") 400 if idx >= 0 { 401 str = str[:idx] 402 } 403 404 // separate at commas 405 htgs := strings.Split(str, ",") 406 407 for _, htg := range htgs { 408 409 // skip empty item 410 if htg == "" { 411 continue 412 } 413 414 // extract accession 415 acc, rgt := SplitInTwoLeft(htg, ":") 416 if acc == "" || rgt == "" { 417 // fmt.Fprintf(os.Stderr, "\nERROR: Unable to parse HGVS '%s'\n", htg) 418 continue 419 } 420 // split into type and variation 421 cls, vrn := SplitInTwoLeft(rgt, ".") 422 if cls == "" || vrn == "" { 423 // fmt.Fprintf(os.Stderr, "\nERROR: Unable to parse HGVS '%s'\n", rgt) 424 continue 425 } 426 427 // normalize variant string 428 vrn = strings.TrimSpace(vrn) 429 vrn = strings.ToLower(vrn) 430 431 // predicted protein change enclosed in parentheses 432 vrn = strings.TrimPrefix(vrn, "(") 433 vrn = strings.TrimSuffix(vrn, ")") 434 435 spdi := parseOneType(cls, acc, vrn) 436 if spdi == nil { 437 continue 438 } 439 440 // get accession fields for sorting 441 pfx := "" 442 num := "" 443 ver := "0" 444 445 // use regular expression to handle XX_ and XX_ABCD accessions 446 arry := accnRegEx.FindAllString(acc, -1) 447 if len(arry) == 2 { 448 pfx = strings.ToUpper(arry[0]) 449 num = strings.ToLower(arry[1]) 450 } else if len(arry) == 4 { 451 pfx = strings.ToUpper(arry[0]) 452 num = strings.ToLower(arry[1]) 453 ver = strings.ToLower(arry[3]) 454 if arry[2] != "." { 455 fmt.Fprintf(os.Stderr, "\nERROR: Unable to parse version '%s', arry '%v'\n", acc, arry) 456 continue 457 } 458 } else { 459 fmt.Fprintf(os.Stderr, "\nERROR: Unable to parse accession '%s', arry '%v'\n", acc, arry) 460 continue 461 } 462 463 if pfx == "" || num == "" { 464 fmt.Fprintf(os.Stderr, "\nERROR: Unable to parse accession '%s'\n", acc) 465 continue 466 } 467 468 // RefSeq accession body should be unsigned integer 469 if !IsAllDigits(num) { 470 fmt.Fprintf(os.Stderr, "\nERROR: Non-integer accession body '%s'\n", num) 471 continue 472 } 473 474 val, err := strconv.Atoi(num) 475 if err != nil { 476 fmt.Fprintf(os.Stderr, "\nERROR: Integer conversion error '%s'\n", err) 477 continue 478 } 479 480 vsn, err := strconv.Atoi(ver) 481 if err != nil { 482 fmt.Fprintf(os.Stderr, "\nERROR: Version error '%s'\n", err) 483 continue 484 } 485 486 spdi.Prefix = pfx 487 spdi.Digits = num 488 spdi.Number = val 489 spdi.Version = vsn 490 491 // record highest version for each accession 492 unver := pfx + num 493 vr, found := highest[unver] 494 if !found { 495 highest[unver] = vsn 496 } else if vr > vsn { 497 highest[unver] = vr 498 } 499 500 if strings.HasPrefix(pfx, "NM_") { 501 hasNM = true 502 } 503 if strings.HasPrefix(pfx, "NP_") { 504 hasNP = true 505 } 506 507 spdis = append(spdis, spdi) 508 ok = true 509 } 510 511 if !ok { 512 return "" 513 } 514 515 sort.Slice(spdis, func(i, j int) bool { 516 a := spdis[i] 517 b := spdis[j] 518 if a.Class != b.Class { 519 return a.Class < b.Class 520 } 521 if a.Type != b.Type { 522 return a.Type < b.Type 523 } 524 if a.Prefix != b.Prefix { 525 return a.Prefix < b.Prefix 526 } 527 // numeric comparison of accession digits ignores leading zeros 528 if a.Number != b.Number { 529 return a.Number < b.Number 530 } 531 // most recent version goes first 532 if a.Version != b.Version { 533 return a.Version > b.Version 534 } 535 if a.Position != b.Position { 536 return a.Position < b.Position 537 } 538 if a.Deleted != b.Deleted { 539 return a.Deleted < b.Deleted 540 } 541 if a.Inserted != b.Inserted { 542 return a.Inserted < b.Inserted 543 } 544 return false 545 }) 546 547 lastClass := "" 548 549 for _, itm := range spdis { 550 551 // skip earlier accession versions 552 unver := itm.Prefix + itm.Digits 553 vr, found := highest[unver] 554 if found && vr > itm.Version { 555 continue 556 } 557 558 // skip XM_ and XP_ if NM_ or NP_, respectively, are already present 559 if hasNM && strings.HasPrefix(itm.Accession, "XM_") { 560 continue 561 } 562 if hasNP && strings.HasPrefix(itm.Accession, "XP_") { 563 continue 564 } 565 566 clss := hgvsClass[itm.Class] 567 htyp := hgvsType[itm.Type] 568 569 if lastClass != clss { 570 if lastClass != "" { 571 buffer.WriteString("</" + lastClass + ">\n") 572 } 573 lastClass = clss 574 buffer.WriteString("<" + lastClass + ">\n") 575 } 576 577 // buffer.WriteString("<" + htyp + ">\n") 578 579 pos := strconv.Itoa(itm.Position) 580 lbl := "Position" 581 if itm.Class == CODING { 582 lbl = "Offset" 583 } 584 buffer.WriteString("<Variant>" + 585 "<Type>" + htyp + "</Type>" + 586 "<Accession>" + itm.Accession + "</Accession>" + 587 "<" + lbl + ">" + pos + "</" + lbl + ">" + 588 "<Deleted>" + itm.Deleted + "</Deleted>" + 589 "<Inserted>" + itm.Inserted + "</Inserted>" + 590 "</Variant>\n") 591 592 // buffer.WriteString("</" + htyp + ">\n") 593 } 594 595 if lastClass != "" { 596 buffer.WriteString("</" + lastClass + ">\n") 597 } 598 599 return buffer.String() 600} 601