1// Copyright 2014 Richard Lehane. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package pronom
16
17import (
18	"encoding/hex"
19	"errors"
20	"strconv"
21	"strings"
22
23	"github.com/richardlehane/siegfried/internal/bytematcher/frames"
24	"github.com/richardlehane/siegfried/internal/bytematcher/patterns"
25	"github.com/richardlehane/siegfried/pkg/pronom/internal/mappings"
26)
27
28// This code produces siegfried bytematcher signatures from the relevant parts of PRONOM, Droid and Container XML signature files
29
30const (
31	pronombof = "Absolute from BOF"
32	pronomeof = "Absolute from EOF"
33	pronomvry = "Variable"
34	droidbof  = "BOFoffset"
35	droideof  = "EOFoffset"
36)
37
38// helper
39func decodeNum(num string) (int, error) {
40	if strings.TrimSpace(num) == "" {
41		return 0, nil
42	}
43	return strconv.Atoi(num)
44}
45
46// PROCompatSequence (compatibility) provides access to the PRONON
47// primitive mappings.ByteSequence for custom identifier types that
48// want to make use of PRONOM's level of expression.
49type PROCompatSequence = mappings.ByteSequence
50
51// BeginningOfFile provides access to PRONOM's BOF const.
52const BeginningOfFile = pronombof
53
54// EndOfFile provides access to PRONOM's EOF const.
55const EndOfFile = pronomeof
56
57// FormatPRONOM is an external helper function for enabling the
58// processing of a significant number of signature types compatible with
59// the PRONOM standard from plain-old hex, to more complex PRONOM regex.
60func FormatPRONOM(id string, ps []PROCompatSequence) (frames.Signature, error) {
61	signature := mappings.Signature{}
62	signature.ByteSequences = ps
63	return processPRONOM(id, signature)
64}
65
66// PRONOM
67func processPRONOM(puid string, s mappings.Signature) (frames.Signature, error) {
68	sig := make(frames.Signature, 0, 1)
69	for _, bs := range s.ByteSequences {
70		// check if <Offset> or <MaxOffset> elements are present
71		min, err := decodeNum(bs.Offset)
72		if err != nil {
73			return nil, err
74		}
75		max, err := decodeNum(bs.MaxOffset)
76		if err != nil {
77			return nil, err
78		}
79		// lack of a max offset implies a fixed offset for BOF and EOF seqs (not VAR)
80		if max == 0 {
81			max = min
82		} else {
83			max = max + min // the max offset in a PRONOM report is relative to the "offset" value, not to the BOF/EOF
84		}
85		var eof bool
86		if bs.Position == pronomeof {
87			eof = true
88		}
89		// parse the hexstring
90		seg, lmin, lmax, err := process(puid, bs.Hex, eof)
91		if err != nil {
92			return nil, err
93		}
94		// check position and add patterns to signature
95		switch bs.Position {
96		case pronombof:
97			if seg[0].Min != 0 || seg[0].Max != 0 {
98				min, max = seg[0].Min, seg[0].Max
99			}
100			seg[0] = frames.NewFrame(frames.BOF, seg[0].Pattern, min, max)
101		case pronomvry:
102			if max == 0 {
103				max = -1
104			}
105			if seg[0].Min != 0 || seg[0].Max != 0 {
106				min, max = seg[0].Min, seg[0].Max
107			}
108			if min == max {
109				max = -1
110			}
111			seg[0] = frames.NewFrame(frames.BOF, seg[0].Pattern, min, max)
112		case pronomeof:
113			if len(seg) > 1 {
114				for i, f := range seg[:len(seg)-1] {
115					seg[i] = frames.NewFrame(frames.SUCC, f.Pattern, seg[i+1].Min, seg[i+1].Max)
116				}
117			}
118			// handle edge case where there is a {x-y} at end of EOF seq e.g. x-fmt/263
119			if lmin != 0 || lmax != 0 {
120				min, max = lmin, lmax
121			}
122			seg[len(seg)-1] = frames.NewFrame(frames.EOF, seg[len(seg)-1].Pattern, min, max)
123		default:
124			return nil, errors.New("Pronom parse error: invalid ByteSequence position " + bs.Position)
125		}
126		// add the segment to the complete signature
127		sig = appendSig(sig, seg, bs.Position)
128	}
129	return sig, nil
130}
131
132// merge two segments into a signature. Provide s2's pos
133func appendSig(s1, s2 frames.Signature, pos string) frames.Signature {
134	if len(s1) == 0 {
135		return s2
136	}
137	// if s2 is an EOF - just append it
138	if pos == pronomeof || pos == droideof {
139		return append(s1, s2...)
140	}
141	// if s1 has an EOF segment, and s2 is a BOF or Var, prepend that s2 segment before it, but after any preceding segments
142	for i, f := range s1 {
143		orientation := f.Orientation()
144		if orientation == frames.SUCC || orientation == frames.EOF {
145			s3 := make(frames.Signature, len(s1)+len(s2))
146			copy(s3, s1[:i])
147			copy(s3[i:], s2)
148			copy(s3[i+len(s2):], s1[i:])
149			return s3
150		}
151	}
152	// default is just to append it
153	return append(s1, s2...)
154}
155
156// DROID & Container
157func processDROID(puid string, s []mappings.ByteSeq) (frames.Signature, error) {
158	var sig frames.Signature
159	for _, b := range s {
160		var eof, vry bool
161		ref := b.Reference
162		if ref == droideof {
163			eof = true
164		} else if ref == "" {
165			vry = true
166		}
167		for _, ss := range b.SubSequences {
168			ns, err := processSubSequence(puid, ss, eof, vry)
169			if err != nil {
170				return nil, err
171			}
172			sig = appendSig(sig, ns, ref)
173		}
174	}
175	return sig, nil
176}
177
178func processSubSequence(puid string, ss mappings.SubSequence, eof, vry bool) (frames.Signature, error) {
179	sig, _, _, err := process(puid, ss.Sequence, eof)
180	if err != nil {
181		return nil, err
182	}
183	if len(ss.LeftFragments) > 0 {
184		sig, err = appendFragments(puid, sig, ss.LeftFragments, true, eof)
185		if err != nil {
186			return nil, err
187		}
188	}
189	if len(ss.RightFragments) > 0 {
190		sig, err = appendFragments(puid, sig, ss.RightFragments, false, eof)
191		if err != nil {
192			return nil, err
193		}
194	}
195	if ss.Position > 1 {
196		vry = true
197	}
198	calcOffset := func(minS, maxS string, vry bool) (int, int, error) {
199		min, err := decodeNum(minS)
200		if err != nil {
201			return 0, 0, err
202		}
203		if maxS == "" {
204			if vry {
205				return min, -1, nil
206			}
207			return min, min, nil // if not var - max should be at least min (which is prob 0)
208		}
209		max, err := decodeNum(maxS)
210		if err != nil {
211			return 0, 0, err
212		}
213		if max == 0 { // fix bug fmt/837 where has a min but no max
214			max = min
215		}
216		return min, max, nil
217	}
218	min, max, err := calcOffset(ss.SubSeqMinOffset, ss.SubSeqMaxOffset, vry)
219	if err != nil {
220		return nil, err
221	}
222	if eof {
223		if ss.Position == 1 {
224			sig[len(sig)-1] = frames.NewFrame(frames.EOF, sig[len(sig)-1].Pattern, min, max)
225		} else {
226			sig[len(sig)-1] = frames.NewFrame(frames.SUCC, sig[len(sig)-1].Pattern, min, max)
227		}
228	} else {
229		if ss.Position == 1 {
230			sig[0] = frames.NewFrame(frames.BOF, sig[0].Pattern, min, max)
231		} else {
232			sig[0] = frames.NewFrame(frames.PREV, sig[0].Pattern, min, max)
233		}
234	}
235	return sig, nil
236}
237
238// append a slice of fragments (left or right) to the central droid sequence
239func appendFragments(puid string, sig frames.Signature, frags []mappings.Fragment, left, eof bool) (frames.Signature, error) {
240	// First off, group the fragments:
241	// droid fragments (right or left) can share positions. If such fragments have same offsets, they are a patterns.Choice. If not, then err.
242	var maxPos int
243	for _, f := range frags {
244		if f.Position == 0 {
245			return nil, errors.New("Pronom: encountered fragment without a position, puid " + puid)
246		}
247		if f.Position > maxPos {
248			maxPos = f.Position
249		}
250	}
251	fs := make([][]mappings.Fragment, maxPos)
252	for _, f := range frags {
253		fs[f.Position-1] = append(fs[f.Position-1], f)
254	}
255	for _, r := range fs {
256		max, min := r[0].MaxOffset, r[0].MinOffset
257		for _, v := range r {
258			if v.MaxOffset != max || v.MinOffset != min {
259				return nil, errors.New("Pronom: encountered fragments at same positions with different offsets, puid " + puid)
260			}
261		}
262	}
263	typ := frames.PREV
264	if eof {
265		typ = frames.SUCC
266	}
267	var choice patterns.Choice
268	offs := make([][2]int, len(fs))
269	ns := make([]frames.Signature, len(fs))
270	// iterate over the grouped fragments
271	for i, v := range fs {
272		if len(v) > 1 {
273			choice = patterns.Choice{}
274			for _, c := range v {
275				pats, _, _, err := process(puid, c.Value, eof)
276				if err != nil {
277					return nil, err
278				}
279				if len(pats) > 1 {
280					list := make(patterns.List, len(pats))
281					for i, v := range pats {
282						list[i] = v.Pattern
283					}
284					choice = append(choice, list)
285				} else {
286					choice = append(choice, pats[0].Pattern)
287				}
288			}
289			ns[i] = frames.Signature{frames.NewFrame(typ, choice, 0, 0)}
290		} else {
291			pats, _, _, err := process(puid, v[0].Value, eof)
292			if err != nil {
293				return nil, err
294			}
295			ns[i] = pats
296		}
297		min, err := decodeNum(v[0].MinOffset)
298		if err != nil {
299			return nil, err
300		}
301		var max int
302		if v[0].MaxOffset == "" {
303			max = -1
304		} else {
305			max, err = decodeNum(v[0].MaxOffset)
306			if err != nil {
307				return nil, err
308			}
309		}
310		offs[i] = [2]int{min, max}
311	}
312	// Now make the frames by adding in offset information (if left fragments, this needs to be taken from their neighbour)
313	if left {
314		if eof {
315			for i, v := range ns {
316				v[len(v)-1] = frames.NewFrame(frames.SUCC, v[len(v)-1].Pattern, offs[i][0], offs[i][1])
317				sig = append(v, sig...)
318			}
319		} else {
320			for i, v := range ns {
321				sig[0] = frames.NewFrame(frames.PREV, sig[0].Pattern, offs[i][0], offs[i][1])
322				sig = append(v, sig...)
323			}
324		}
325	} else {
326		if eof {
327			for i, v := range ns {
328				sig[len(sig)-1] = frames.NewFrame(frames.SUCC, sig[len(sig)-1].Pattern, offs[i][0], offs[i][1])
329				sig = append(sig, v...)
330			}
331
332		} else {
333			for i, v := range ns {
334				v[0] = frames.NewFrame(frames.PREV, v[0].Pattern, offs[i][0], offs[i][1])
335				sig = append(sig, v...)
336			}
337		}
338	}
339	return sig, nil
340}
341
342// Shared code for processing raw lex outputs in PRONOM/Container pattern language
343
344func process(puid, seq string, eof bool) (frames.Signature, int, int, error) {
345	if seq == "" {
346		return nil, 0, 0, errors.New("parse error " + puid + ": empty sequence")
347	}
348	typ := frames.PREV
349	if eof {
350		typ = frames.SUCC
351	}
352	var min, max int
353	l := lexPRONOM(puid, seq)
354	sig := frames.Signature{}
355	for i := l.nextItem(); i.typ != itemEOF; i = l.nextItem() {
356		switch i.typ {
357		case itemError:
358			return nil, 0, 0, errors.New("parse error " + puid + ": " + i.String())
359		case itemWildSingle:
360			min++
361			max++
362		case itemWildStart:
363			min, _ = decodeNum(i.val)
364		case itemCurlyRight: //detect {n} wildcards by checking if the max value has been set
365			if max == 0 {
366				max = min
367			}
368		case itemWildEnd:
369			if i.val == "*" {
370				max = -1
371			} else {
372				max, _ = decodeNum(i.val)
373			}
374		case itemWild:
375			max = -1
376		case itemEnterGroup:
377			pat, err := processGroup(l)
378			if err != nil {
379				return nil, 0, 0, errors.New("parse error " + puid + ": " + err.Error())
380			}
381			sig = append(sig, frames.NewFrame(typ, pat, min, max))
382			min, max = 0, 0
383		case itemUnprocessedText:
384			sig = append(sig, frames.NewFrame(typ, patterns.Sequence(processText(i.val)), min, max))
385			min, max = 0, 0
386		}
387	}
388	return sig, min, max, nil
389}
390
391func processText(hx string) []byte {
392	var buf []byte
393	l := lexText(hx)
394	for i := range l.items {
395		switch i.typ {
396		case itemHexText:
397			byts, _ := hex.DecodeString(i.val)
398			buf = append(buf, byts...)
399		case itemQuoteText:
400			buf = append(buf, []byte(i.val)...)
401		case itemError:
402			panic(i.val)
403		case itemEOF:
404			return buf
405		}
406	}
407	// ignore err, the hex string has been lexed
408	return buf
409}
410
411// groups are chunks of PRONOM/Droid patterns delimited by parentheses or brackets
412// these chunks represent any non-sequence pattern (choices, ranges, bitmasks, not-patterns etc.)
413func processGroup(l *lexer) (patterns.Pattern, error) {
414	var (
415		list                    patterns.List   // bucket to stuff patterns into
416		choice                  patterns.Choice // bucket to stuff choices into
417		val                     []byte          // bucket to stuff text values
418		not, mask, anyMask, rng bool            // retains state from previous tokens
419	)
420	// when commit a pattern (to the list), go back to zero state
421	reset := func() {
422		val = []byte{}
423		not, mask, anyMask, rng = false, false, false, false
424	}
425	// make a pattern based on the current state
426	makePat := func() patterns.Pattern {
427		if len(val) == 0 {
428			return nil
429		}
430		var pat patterns.Pattern
431		switch {
432		case mask:
433			pat = patterns.Mask(val[0])
434		case anyMask:
435			pat = patterns.AnyMask(val[0])
436		default:
437			pat = patterns.Sequence(val)
438		}
439		if not {
440			pat = patterns.Not{pat}
441		}
442		reset()
443		return pat
444	}
445	// add patterns to the choice
446	addChoice := func() (patterns.Choice, error) {
447		switch len(list) {
448		case 0:
449			return nil, errors.New(l.name + " has choice marker without preceding pattern")
450		case 1:
451			choice = append(choice, list[0])
452		default:
453			choice = append(choice, list)
454		}
455		list = patterns.List{}
456		return choice, nil
457	}
458	for {
459		i := <-l.items
460		switch i.typ {
461		default:
462			return nil, errors.New(l.name + " encountered unexpected token " + i.val)
463		case itemEnterGroup: // recurse e.g. for a range nested within a choice
464			if pat := makePat(); pat != nil {
465				list = append(list, pat)
466			}
467			pat, err := processGroup(l)
468			if err != nil {
469				return nil, err
470			}
471			list = append(list, pat)
472		case itemExitGroup:
473			if pat := makePat(); pat != nil {
474				list = append(list, pat)
475			}
476			if len(choice) > 0 {
477				return addChoice()
478			} else {
479				switch len(list) {
480				case 0:
481					return nil, errors.New(l.name + " has group with no legal pattern")
482				case 1:
483					return list[0], nil
484				default:
485					return list, nil
486				}
487			}
488		case itemRangeMarker:
489			rng = true
490		case itemChoiceMarker:
491			if pat := makePat(); pat != nil {
492				list = append(list, pat)
493			}
494			_, err := addChoice()
495			if err != nil {
496				return nil, err
497			}
498		case itemNotMarker:
499			not = true
500		case itemMaskMarker:
501			mask = true
502		case itemAnyMaskMarker:
503			anyMask = true
504		case itemUnprocessedText:
505			v := processText(i.val)
506			// if it is a range, we need values before and after the range marker, so add it here
507			if rng {
508				r := Range{val, v}
509				if not {
510					list = append(list, patterns.Not{r})
511				} else {
512					list = append(list, r)
513				}
514				reset()
515			} else {
516				val = v
517			}
518		}
519	}
520}
521