1// Copyright 2020 Ross Spencer, Richard Lehane. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
12// implied. See the License for the specific language governing
13// permissions and limitations under the License.
14
15// Perform some linting on the SPARQL we receive from Wikidata. This is
16// all preliminary stuff where we will still need to wrangle the
17// signatures to be useful in aggregate. Using that as a rule then we
18// only do enough work here to make that wrangling a bit easier later
19// on.
20
21package wikidata
22
23import (
24	"fmt"
25	"strconv"
26	"strings"
27
28	"github.com/richardlehane/siegfried/pkg/wikidata/internal/converter"
29)
30
// linter accumulates linting results per Wikidata URI. The inner map
// behaves as a set of distinct lintingResult values; its bool value
// mirrors the result's Critical flag.
var linter = make(map[string]map[lintingResult]bool)
33
// lintingResult provides a data structure to store information about
// errors encountered while trying to process Wikidata records. Used as
// a map key in linter, so all fields must remain comparable.
type lintingResult struct {
	URI      string  // URI of the Wikidata record the issue was found in.
	Value    linting // Linting error enumerator value.
	Critical bool    // Critical, true or false (true means the record is unusable).
}
41
42// addLinting adds linting errors to our linter map when the function
43// is called.
44func addLinting(uri string, value linting) {
45	if value == nle {
46		return
47	}
48	critical := false
49	switch value {
50	case offWDE02:
51	case relWDE02:
52	case heuWDE01:
53		critical = true
54	}
55	linting := lintingResult{}
56	linting.URI = uri
57	linting.Value = value
58	linting.Critical = critical
59	if linter[uri] == nil {
60		lMap := make(map[lintingResult]bool)
61		lMap[linting] = critical
62		linter[uri] = lMap
63		return
64	}
65	linter[uri][linting] = critical
66}
67
68// lintingToString will output our linting errors in an easy to consume
69// slice.
70func lintingToString() []string {
71	var lintingMessages []string
72	for _, result := range linter {
73		for res := range result {
74			s := fmt.Sprintf(
75				"%s: URI: %s Critical: %t", lintingLookup(res.Value),
76				res.URI,
77				res.Critical,
78			)
79			lintingMessages = append(lintingMessages, s)
80		}
81	}
82	return lintingMessages
83}
84
85// countLintingErrors will count all the linting errors returned during
86// processing. It will return two counts, that of all the records with
87// at least one error, and that of all the individual errors.
88func countLintingErrors() (int, int, int) {
89	var recordCount, individualCount, badHeuristicCount int
90	for _, result := range linter {
91		recordCount++
92		for res := range result {
93			if res.Value == heuWDE01 || res.Value == heuWDE02 {
94				badHeuristicCount++
95			}
96			individualCount++
97		}
98	}
99	return recordCount, individualCount, badHeuristicCount
100}
101
// linting enumerates the categories of data issue encountered while
// processing Wikidata records (see the constant block below).
type linting int

// nle provides a nil for no lint errors.
const nle = noLintingError
106
// Linting enumerator. This approach feels like it might be a little
// old fashioned but it lets us capture as many of the data issues we're
// seeing in Wikidata as they come up so that they can be fixed. Once
// we can find better control of the source data I think we'll be able
// to get rid of this and use a much simpler approach for compiling
// the set of signatures for the identifier. Human-readable strings for
// each value live in lintingLookup.
const (
	noLintingError linting = iota // noLintingError encodes No linting error.

	// Offset based linting issues.
	offWDE01 // offWDE01 encodes ErrNoOffset
	offWDE02 // offWDE02 encodes ErrCannotParseOffset
	offWDE03 // offWDE03 encodes ErrBlankNodeOffset

	// Relativity based linting issues.
	relWDE01 // relWDE01 encodes ErrEmptyStringRelativity
	relWDE02 // relWDE02 encodes ErrUnknownRelativity

	// Encoding based linting issues.
	encWDE01 // encWDE01 encodes ErrNoEncoding

	// Provenance based linting issues.
	proWDE01 // proWDE01 encodes ErrNoProvenance
	proWDE02 // proWDE02 encodes ErrNoDate

	// Sequence based linting issues.
	seqWDE01 // seqWDE01 encodes ErrDuplicateSequence

	// Heuristic errors. We have to give up on this record.
	heuWDE01 // heuWDE01 encodes ErrNoHeuristic
	heuWDE02 // heuWDE02 encodes ErrCannotProcessSequence
)
139
140// lintingLookup returns a plain-text string for the type of errors or
141// issues that we encounter when trying to process Wikidata records
142// into an identifier.
143func lintingLookup(lint linting) string {
144	switch lint {
145	case offWDE01:
146		return "Linting: WARNING no offset"
147	case offWDE02:
148		return "Linting: ERROR cannot parse offset"
149	case offWDE03:
150		return "Linting: ERROR blank node returned for offset"
151	case relWDE01:
152		return "Linting: WARNING no relativity"
153	case relWDE02:
154		return "Linting: ERROR unknown relativity"
155	case encWDE01:
156		return "Linting: WARNING no encoding"
157	case seqWDE01:
158		return "Linting: ERROR duplicate sequence"
159	case proWDE01:
160		return "Linting: WARNING no provenance"
161	case proWDE02:
162		return "Linting: WARNING no provenance date"
163	case heuWDE01:
164		return "Linting: ERROR bad heuristic"
165	case heuWDE02:
166		return "Linting: ERROR cannot process sequence"
167	case noLintingError:
168		return "Linting: INFO no linting errors"
169	}
170	return "Linting: ERROR unknown linting error"
171}
172
// preProcessedSequence gives us a way to hold temporary information
// about the signature associated with a record. All fields are raw
// strings as received from the SPARQL results before validation and
// conversion.
type preProcessedSequence struct {
	signature  string // Hex/ASCII/PRONOM pattern as recorded in Wikidata.
	offset     string // Offset as a string; parsed later by validateAndReturnOffset.
	relativity string // e.g. "beginning of file" or "end of file".
	encoding   string // Encoding label; resolved via the converter package.
}
181
// relativities as encoded in Wikidata records. Comparisons are made
// against the lower-cased Wikidata value.
const (
	relativeBOF = "beginning of file"
	relativeEOF = "end of file"
)
187
188// validateAndReturnProvenance performs some arbitrary validation on
189// provenance as recorded by Wikidata and let's us know any issues
190// with it. Right now we can only really say if the provenance field
191// is empty, it's not going to be very useful to us.
192func validateAndReturnProvenance(value string) (string, linting) {
193	if value == "" {
194		return value, proWDE01
195	}
196	return value, nle
197}
198
199// validateAndReturnDate will perform some validation on the provenance
200// date we are able to access from Wikidata records. If the value is
201// blank for example, it will return a linting warning.
202func validateAndReturnDate(value string) (string, linting) {
203	if value == "" {
204		return value, proWDE02
205	}
206	return value, nle
207}
208
209// validateAndReturnEncoding asks whether the encoding we can access
210// from Wikidata is known to Siegfried. If it isn't then we know for
211// now that we cannot handle it. If we cannot handle it, we either need
212// to correct the Wikidata record, or add capability to Siegfried or
213// the converter package.
214func validateAndReturnEncoding(value string) (int, linting) {
215	encoding := converter.LookupEncoding(value)
216	if encoding == converter.UnknownEncoding {
217		return encoding, encWDE01
218	}
219	return encoding, nle
220}
221
222// validateAndReturnRelativity will return a string and an error based
223// on whether the relativity of a format identification pattern, e.g.
224// BOF, EOF is known. If it isn't then it makes it more difficult to
225// process in Roy/Siegfried.
226func validateAndReturnRelativity(value string) (string, linting, error) {
227	const unknownRelativity = "Received an unknown relativity"
228	if value == "" {
229		// Assume beginning of file.
230		return relativeBOF, relWDE01, nil
231	} else if strings.ToLower(value) == relativeBOF {
232		return relativeBOF, nle, nil
233	} else if strings.ToLower(value) == relativeEOF {
234		return relativeEOF, nle, nil
235	}
236	return value, relWDE02, fmt.Errorf("%s: '%s'", unknownRelativity, value)
237}
238
239// validateAndReturnOffset will return an integer and an error based on
240// whether we can use the offset delivered by Wikidata.
241func validateAndReturnOffset(value string, nodeType string) (int, linting) {
242	const blankNodeType = "bnode"
243	const blankNodeErr = "Received a blank node type instead of offset"
244	var offset int
245	if value == "" {
246		return offset, nle
247	} else if nodeType == blankNodeType {
248		return offset, offWDE03
249	}
250	offset, err := strconv.Atoi(value)
251	if err != nil {
252		return offset, offWDE02
253	}
254	return offset, nle
255}
256
257// validateAndReturnSignature calls the converter functions to normalize
258// our signature. We need to do this so that we can compare signatures
259// and remove duplicates and identify other errors.
260func validateAndReturnSignature(value string, encoding int) (string, linting, error) {
261	value, _, _, err := converter.Parse(value, encoding)
262	if err != nil {
263		return value, heuWDE02, err
264	}
265	return value, nle, nil
266}
267