1// Copyright 2014 Richard Lehane. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package pronom 16 17import ( 18 "encoding/hex" 19 "errors" 20 "strconv" 21 "strings" 22 23 "github.com/richardlehane/siegfried/internal/bytematcher/frames" 24 "github.com/richardlehane/siegfried/internal/bytematcher/patterns" 25 "github.com/richardlehane/siegfried/pkg/pronom/internal/mappings" 26) 27 28// This code produces siegfried bytematcher signatures from the relevant parts of PRONOM, Droid and Container XML signature files 29 30const ( 31 pronombof = "Absolute from BOF" 32 pronomeof = "Absolute from EOF" 33 pronomvry = "Variable" 34 droidbof = "BOFoffset" 35 droideof = "EOFoffset" 36) 37 38// helper 39func decodeNum(num string) (int, error) { 40 if strings.TrimSpace(num) == "" { 41 return 0, nil 42 } 43 return strconv.Atoi(num) 44} 45 46// PROCompatSequence (compatibility) provides access to the PRONON 47// primitive mappings.ByteSequence for custom identifier types that 48// want to make use of PRONOM's level of expression. 49type PROCompatSequence = mappings.ByteSequence 50 51// BeginningOfFile provides access to PRONOM's BOF const. 52const BeginningOfFile = pronombof 53 54// EndOfFile provides access to PRONOM's EOF const. 55const EndOfFile = pronomeof 56 57// FormatPRONOM is an external helper function for enabling the 58// processing of a significant number of signature types compatible with 59// the PRONOM standard from plain-old hex, to more complex PRONOM regex. 60func FormatPRONOM(id string, ps []PROCompatSequence) (frames.Signature, error) { 61 signature := mappings.Signature{} 62 signature.ByteSequences = ps 63 return processPRONOM(id, signature) 64} 65 66// PRONOM 67func processPRONOM(puid string, s mappings.Signature) (frames.Signature, error) { 68 sig := make(frames.Signature, 0, 1) 69 for _, bs := range s.ByteSequences { 70 // check if <Offset> or <MaxOffset> elements are present 71 min, err := decodeNum(bs.Offset) 72 if err != nil { 73 return nil, err 74 } 75 max, err := decodeNum(bs.MaxOffset) 76 if err != nil { 77 return nil, err 78 } 79 // lack of a max offset implies a fixed offset for BOF and EOF seqs (not VAR) 80 if max == 0 { 81 max = min 82 } else { 83 max = max + min // the max offset in a PRONOM report is relative to the "offset" value, not to the BOF/EOF 84 } 85 var eof bool 86 if bs.Position == pronomeof { 87 eof = true 88 } 89 // parse the hexstring 90 seg, lmin, lmax, err := process(puid, bs.Hex, eof) 91 if err != nil { 92 return nil, err 93 } 94 // check position and add patterns to signature 95 switch bs.Position { 96 case pronombof: 97 if seg[0].Min != 0 || seg[0].Max != 0 { 98 min, max = seg[0].Min, seg[0].Max 99 } 100 seg[0] = frames.NewFrame(frames.BOF, seg[0].Pattern, min, max) 101 case pronomvry: 102 if max == 0 { 103 max = -1 104 } 105 if seg[0].Min != 0 || seg[0].Max != 0 { 106 min, max = seg[0].Min, seg[0].Max 107 } 108 if min == max { 109 max = -1 110 } 111 seg[0] = frames.NewFrame(frames.BOF, seg[0].Pattern, min, max) 112 case pronomeof: 113 if len(seg) > 1 { 114 for i, f := range seg[:len(seg)-1] { 115 seg[i] = frames.NewFrame(frames.SUCC, f.Pattern, seg[i+1].Min, seg[i+1].Max) 116 } 117 } 118 // handle edge case where there is a {x-y} at end of EOF seq e.g. x-fmt/263 119 if lmin != 0 || lmax != 0 { 120 min, max = lmin, lmax 121 } 122 seg[len(seg)-1] = frames.NewFrame(frames.EOF, seg[len(seg)-1].Pattern, min, max) 123 default: 124 return nil, errors.New("Pronom parse error: invalid ByteSequence position " + bs.Position) 125 } 126 // add the segment to the complete signature 127 sig = appendSig(sig, seg, bs.Position) 128 } 129 return sig, nil 130} 131 132// merge two segments into a signature. Provide s2's pos 133func appendSig(s1, s2 frames.Signature, pos string) frames.Signature { 134 if len(s1) == 0 { 135 return s2 136 } 137 // if s2 is an EOF - just append it 138 if pos == pronomeof || pos == droideof { 139 return append(s1, s2...) 140 } 141 // if s1 has an EOF segment, and s2 is a BOF or Var, prepend that s2 segment before it, but after any preceding segments 142 for i, f := range s1 { 143 orientation := f.Orientation() 144 if orientation == frames.SUCC || orientation == frames.EOF { 145 s3 := make(frames.Signature, len(s1)+len(s2)) 146 copy(s3, s1[:i]) 147 copy(s3[i:], s2) 148 copy(s3[i+len(s2):], s1[i:]) 149 return s3 150 } 151 } 152 // default is just to append it 153 return append(s1, s2...) 154} 155 156// DROID & Container 157func processDROID(puid string, s []mappings.ByteSeq) (frames.Signature, error) { 158 var sig frames.Signature 159 for _, b := range s { 160 var eof, vry bool 161 ref := b.Reference 162 if ref == droideof { 163 eof = true 164 } else if ref == "" { 165 vry = true 166 } 167 for _, ss := range b.SubSequences { 168 ns, err := processSubSequence(puid, ss, eof, vry) 169 if err != nil { 170 return nil, err 171 } 172 sig = appendSig(sig, ns, ref) 173 } 174 } 175 return sig, nil 176} 177 178func processSubSequence(puid string, ss mappings.SubSequence, eof, vry bool) (frames.Signature, error) { 179 sig, _, _, err := process(puid, ss.Sequence, eof) 180 if err != nil { 181 return nil, err 182 } 183 if len(ss.LeftFragments) > 0 { 184 sig, err = appendFragments(puid, sig, ss.LeftFragments, true, eof) 185 if err != nil { 186 return nil, err 187 } 188 } 189 if len(ss.RightFragments) > 0 { 190 sig, err = appendFragments(puid, sig, ss.RightFragments, false, eof) 191 if err != nil { 192 return nil, err 193 } 194 } 195 if ss.Position > 1 { 196 vry = true 197 } 198 calcOffset := func(minS, maxS string, vry bool) (int, int, error) { 199 min, err := decodeNum(minS) 200 if err != nil { 201 return 0, 0, err 202 } 203 if maxS == "" { 204 if vry { 205 return min, -1, nil 206 } 207 return min, min, nil // if not var - max should be at least min (which is prob 0) 208 } 209 max, err := decodeNum(maxS) 210 if err != nil { 211 return 0, 0, err 212 } 213 if max == 0 { // fix bug fmt/837 where has a min but no max 214 max = min 215 } 216 return min, max, nil 217 } 218 min, max, err := calcOffset(ss.SubSeqMinOffset, ss.SubSeqMaxOffset, vry) 219 if err != nil { 220 return nil, err 221 } 222 if eof { 223 if ss.Position == 1 { 224 sig[len(sig)-1] = frames.NewFrame(frames.EOF, sig[len(sig)-1].Pattern, min, max) 225 } else { 226 sig[len(sig)-1] = frames.NewFrame(frames.SUCC, sig[len(sig)-1].Pattern, min, max) 227 } 228 } else { 229 if ss.Position == 1 { 230 sig[0] = frames.NewFrame(frames.BOF, sig[0].Pattern, min, max) 231 } else { 232 sig[0] = frames.NewFrame(frames.PREV, sig[0].Pattern, min, max) 233 } 234 } 235 return sig, nil 236} 237 238// append a slice of fragments (left or right) to the central droid sequence 239func appendFragments(puid string, sig frames.Signature, frags []mappings.Fragment, left, eof bool) (frames.Signature, error) { 240 // First off, group the fragments: 241 // droid fragments (right or left) can share positions. If such fragments have same offsets, they are a patterns.Choice. If not, then err. 242 var maxPos int 243 for _, f := range frags { 244 if f.Position == 0 { 245 return nil, errors.New("Pronom: encountered fragment without a position, puid " + puid) 246 } 247 if f.Position > maxPos { 248 maxPos = f.Position 249 } 250 } 251 fs := make([][]mappings.Fragment, maxPos) 252 for _, f := range frags { 253 fs[f.Position-1] = append(fs[f.Position-1], f) 254 } 255 for _, r := range fs { 256 max, min := r[0].MaxOffset, r[0].MinOffset 257 for _, v := range r { 258 if v.MaxOffset != max || v.MinOffset != min { 259 return nil, errors.New("Pronom: encountered fragments at same positions with different offsets, puid " + puid) 260 } 261 } 262 } 263 typ := frames.PREV 264 if eof { 265 typ = frames.SUCC 266 } 267 var choice patterns.Choice 268 offs := make([][2]int, len(fs)) 269 ns := make([]frames.Signature, len(fs)) 270 // iterate over the grouped fragments 271 for i, v := range fs { 272 if len(v) > 1 { 273 choice = patterns.Choice{} 274 for _, c := range v { 275 pats, _, _, err := process(puid, c.Value, eof) 276 if err != nil { 277 return nil, err 278 } 279 if len(pats) > 1 { 280 list := make(patterns.List, len(pats)) 281 for i, v := range pats { 282 list[i] = v.Pattern 283 } 284 choice = append(choice, list) 285 } else { 286 choice = append(choice, pats[0].Pattern) 287 } 288 } 289 ns[i] = frames.Signature{frames.NewFrame(typ, choice, 0, 0)} 290 } else { 291 pats, _, _, err := process(puid, v[0].Value, eof) 292 if err != nil { 293 return nil, err 294 } 295 ns[i] = pats 296 } 297 min, err := decodeNum(v[0].MinOffset) 298 if err != nil { 299 return nil, err 300 } 301 var max int 302 if v[0].MaxOffset == "" { 303 max = -1 304 } else { 305 max, err = decodeNum(v[0].MaxOffset) 306 if err != nil { 307 return nil, err 308 } 309 } 310 offs[i] = [2]int{min, max} 311 } 312 // Now make the frames by adding in offset information (if left fragments, this needs to be taken from their neighbour) 313 if left { 314 if eof { 315 for i, v := range ns { 316 v[len(v)-1] = frames.NewFrame(frames.SUCC, v[len(v)-1].Pattern, offs[i][0], offs[i][1]) 317 sig = append(v, sig...) 318 } 319 } else { 320 for i, v := range ns { 321 sig[0] = frames.NewFrame(frames.PREV, sig[0].Pattern, offs[i][0], offs[i][1]) 322 sig = append(v, sig...) 323 } 324 } 325 } else { 326 if eof { 327 for i, v := range ns { 328 sig[len(sig)-1] = frames.NewFrame(frames.SUCC, sig[len(sig)-1].Pattern, offs[i][0], offs[i][1]) 329 sig = append(sig, v...) 330 } 331 332 } else { 333 for i, v := range ns { 334 v[0] = frames.NewFrame(frames.PREV, v[0].Pattern, offs[i][0], offs[i][1]) 335 sig = append(sig, v...) 336 } 337 } 338 } 339 return sig, nil 340} 341 342// Shared code for processing raw lex outputs in PRONOM/Container pattern language 343 344func process(puid, seq string, eof bool) (frames.Signature, int, int, error) { 345 if seq == "" { 346 return nil, 0, 0, errors.New("parse error " + puid + ": empty sequence") 347 } 348 typ := frames.PREV 349 if eof { 350 typ = frames.SUCC 351 } 352 var min, max int 353 l := lexPRONOM(puid, seq) 354 sig := frames.Signature{} 355 for i := l.nextItem(); i.typ != itemEOF; i = l.nextItem() { 356 switch i.typ { 357 case itemError: 358 return nil, 0, 0, errors.New("parse error " + puid + ": " + i.String()) 359 case itemWildSingle: 360 min++ 361 max++ 362 case itemWildStart: 363 min, _ = decodeNum(i.val) 364 case itemCurlyRight: //detect {n} wildcards by checking if the max value has been set 365 if max == 0 { 366 max = min 367 } 368 case itemWildEnd: 369 if i.val == "*" { 370 max = -1 371 } else { 372 max, _ = decodeNum(i.val) 373 } 374 case itemWild: 375 max = -1 376 case itemEnterGroup: 377 pat, err := processGroup(l) 378 if err != nil { 379 return nil, 0, 0, errors.New("parse error " + puid + ": " + err.Error()) 380 } 381 sig = append(sig, frames.NewFrame(typ, pat, min, max)) 382 min, max = 0, 0 383 case itemUnprocessedText: 384 sig = append(sig, frames.NewFrame(typ, patterns.Sequence(processText(i.val)), min, max)) 385 min, max = 0, 0 386 } 387 } 388 return sig, min, max, nil 389} 390 391func processText(hx string) []byte { 392 var buf []byte 393 l := lexText(hx) 394 for i := range l.items { 395 switch i.typ { 396 case itemHexText: 397 byts, _ := hex.DecodeString(i.val) 398 buf = append(buf, byts...) 399 case itemQuoteText: 400 buf = append(buf, []byte(i.val)...) 401 case itemError: 402 panic(i.val) 403 case itemEOF: 404 return buf 405 } 406 } 407 // ignore err, the hex string has been lexed 408 return buf 409} 410 411// groups are chunks of PRONOM/Droid patterns delimited by parentheses or brackets 412// these chunks represent any non-sequence pattern (choices, ranges, bitmasks, not-patterns etc.) 413func processGroup(l *lexer) (patterns.Pattern, error) { 414 var ( 415 list patterns.List // bucket to stuff patterns into 416 choice patterns.Choice // bucket to stuff choices into 417 val []byte // bucket to stuff text values 418 not, mask, anyMask, rng bool // retains state from previous tokens 419 ) 420 // when commit a pattern (to the list), go back to zero state 421 reset := func() { 422 val = []byte{} 423 not, mask, anyMask, rng = false, false, false, false 424 } 425 // make a pattern based on the current state 426 makePat := func() patterns.Pattern { 427 if len(val) == 0 { 428 return nil 429 } 430 var pat patterns.Pattern 431 switch { 432 case mask: 433 pat = patterns.Mask(val[0]) 434 case anyMask: 435 pat = patterns.AnyMask(val[0]) 436 default: 437 pat = patterns.Sequence(val) 438 } 439 if not { 440 pat = patterns.Not{pat} 441 } 442 reset() 443 return pat 444 } 445 // add patterns to the choice 446 addChoice := func() (patterns.Choice, error) { 447 switch len(list) { 448 case 0: 449 return nil, errors.New(l.name + " has choice marker without preceding pattern") 450 case 1: 451 choice = append(choice, list[0]) 452 default: 453 choice = append(choice, list) 454 } 455 list = patterns.List{} 456 return choice, nil 457 } 458 for { 459 i := <-l.items 460 switch i.typ { 461 default: 462 return nil, errors.New(l.name + " encountered unexpected token " + i.val) 463 case itemEnterGroup: // recurse e.g. for a range nested within a choice 464 if pat := makePat(); pat != nil { 465 list = append(list, pat) 466 } 467 pat, err := processGroup(l) 468 if err != nil { 469 return nil, err 470 } 471 list = append(list, pat) 472 case itemExitGroup: 473 if pat := makePat(); pat != nil { 474 list = append(list, pat) 475 } 476 if len(choice) > 0 { 477 return addChoice() 478 } else { 479 switch len(list) { 480 case 0: 481 return nil, errors.New(l.name + " has group with no legal pattern") 482 case 1: 483 return list[0], nil 484 default: 485 return list, nil 486 } 487 } 488 case itemRangeMarker: 489 rng = true 490 case itemChoiceMarker: 491 if pat := makePat(); pat != nil { 492 list = append(list, pat) 493 } 494 _, err := addChoice() 495 if err != nil { 496 return nil, err 497 } 498 case itemNotMarker: 499 not = true 500 case itemMaskMarker: 501 mask = true 502 case itemAnyMaskMarker: 503 anyMask = true 504 case itemUnprocessedText: 505 v := processText(i.val) 506 // if it is a range, we need values before and after the range marker, so add it here 507 if rng { 508 r := Range{val, v} 509 if not { 510 list = append(list, patterns.Not{r}) 511 } else { 512 list = append(list, r) 513 } 514 reset() 515 } else { 516 val = v 517 } 518 } 519 } 520} 521