// Copyright 2020 Ross Spencer, Richard Lehane. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.

// Perform some linting on the SPARQL we receive from Wikidata. This is
// all preliminary stuff where we will still need to wrangle the
// signatures to be useful in aggregate. Using that as a rule then we
// only do enough work here to make that wrangling a bit easier later
// on.

package wikidata

import (
	"fmt"
	"strconv"
	"strings"

	"github.com/richardlehane/siegfried/pkg/wikidata/internal/converter"
)

// linter maps a Wikidata URI to the set of linting results recorded
// against it. The bool value mirrors lintingResult.Critical so callers
// can test criticality without re-deriving it.
var linter = make(map[string]map[lintingResult]bool)

// lintingResult provides a data structure to store information about
// errors encountered while trying to process Wikidata records.
type lintingResult struct {
	URI      string  // URI of the Wikidata record.
	Value    linting // Linting error.
	Critical bool    // Critical, true or false.
}

// addLinting adds linting errors to our linter map when the function
// is called.
44func addLinting(uri string, value linting) { 45 if value == nle { 46 return 47 } 48 critical := false 49 switch value { 50 case offWDE02: 51 case relWDE02: 52 case heuWDE01: 53 critical = true 54 } 55 linting := lintingResult{} 56 linting.URI = uri 57 linting.Value = value 58 linting.Critical = critical 59 if linter[uri] == nil { 60 lMap := make(map[lintingResult]bool) 61 lMap[linting] = critical 62 linter[uri] = lMap 63 return 64 } 65 linter[uri][linting] = critical 66} 67 68// lintingToString will output our linting errors in an easy to consume 69// slice. 70func lintingToString() []string { 71 var lintingMessages []string 72 for _, result := range linter { 73 for res := range result { 74 s := fmt.Sprintf( 75 "%s: URI: %s Critical: %t", lintingLookup(res.Value), 76 res.URI, 77 res.Critical, 78 ) 79 lintingMessages = append(lintingMessages, s) 80 } 81 } 82 return lintingMessages 83} 84 85// countLintingErrors will count all the linting errors returned during 86// processing. It will return two counts, that of all the records with 87// at least one error, and that of all the individual errors. 88func countLintingErrors() (int, int, int) { 89 var recordCount, individualCount, badHeuristicCount int 90 for _, result := range linter { 91 recordCount++ 92 for res := range result { 93 if res.Value == heuWDE01 || res.Value == heuWDE02 { 94 badHeuristicCount++ 95 } 96 individualCount++ 97 } 98 } 99 return recordCount, individualCount, badHeuristicCount 100} 101 102type linting int 103 104// nle provides a nil for no lint errors. 105const nle = noLintingError 106 107// Linting enumerator. This approach feels like it might be a little 108// old fashioned but it lets us capture as many of the data issues we're 109// seeing in Wikidata as they come up so that they can be fixed. Once 110// we can find better control of the source data I think we'll be able 111// to get rid of this and use a much simpler approach for compiling 112// the set of signatures for the identifier. 
const (
	noLintingError linting = iota // noLintingError encodes No linting error.

	// Offset based linting issues.
	offWDE01 // offWDE01 encodes ErrNoOffset
	offWDE02 // offWDE02 encodes ErrCannotParseOffset
	offWDE03 // offWDE03 encodes ErrBlankNodeOffset

	// Relativity based linting issues.
	relWDE01 // relWDE01 encodes ErrEmptyStringRelativity
	relWDE02 // relWDE02 encodes ErrUnknownRelativity

	// Encoding based linting issues.
	encWDE01 // encWDE01 encodes ErrNoEncoding

	// Provenance based linting issues.
	proWDE01 // proWDE01 encodes ErrNoProvenance
	proWDE02 // proWDE02 encodes ErrNoDate

	// Sequence based linting issues.
	seqWDE01 // seqWDE01 encodes ErrDuplicateSequence

	// Heuristic errors. We have to give up on this record.
	heuWDE01 // heuWDE01 encodes ErrNoHeuristic
	heuWDE02 // heuWDE02 encodes ErrCannotProcessSequence
)

// lintingLookup returns a plain-text string for the type of errors or
// issues that we encounter when trying to process Wikidata records
// into an identifier.
143func lintingLookup(lint linting) string { 144 switch lint { 145 case offWDE01: 146 return "Linting: WARNING no offset" 147 case offWDE02: 148 return "Linting: ERROR cannot parse offset" 149 case offWDE03: 150 return "Linting: ERROR blank node returned for offset" 151 case relWDE01: 152 return "Linting: WARNING no relativity" 153 case relWDE02: 154 return "Linting: ERROR unknown relativity" 155 case encWDE01: 156 return "Linting: WARNING no encoding" 157 case seqWDE01: 158 return "Linting: ERROR duplicate sequence" 159 case proWDE01: 160 return "Linting: WARNING no provenance" 161 case proWDE02: 162 return "Linting: WARNING no provenance date" 163 case heuWDE01: 164 return "Linting: ERROR bad heuristic" 165 case heuWDE02: 166 return "Linting: ERROR cannot process sequence" 167 case noLintingError: 168 return "Linting: INFO no linting errors" 169 } 170 return "Linting: ERROR unknown linting error" 171} 172 173// preProcessedSequence gives us a way to hold temporary information 174// about the signature associated with a record. 175type preProcessedSequence struct { 176 signature string 177 offset string 178 relativity string 179 encoding string 180} 181 182// relativities as encoded in Wikidata records. 183const ( 184 relativeBOF = "beginning of file" 185 relativeEOF = "end of file" 186) 187 188// validateAndReturnProvenance performs some arbitrary validation on 189// provenance as recorded by Wikidata and let's us know any issues 190// with it. Right now we can only really say if the provenance field 191// is empty, it's not going to be very useful to us. 192func validateAndReturnProvenance(value string) (string, linting) { 193 if value == "" { 194 return value, proWDE01 195 } 196 return value, nle 197} 198 199// validateAndReturnDate will perform some validation on the provenance 200// date we are able to access from Wikidata records. If the value is 201// blank for example, it will return a linting warning. 
202func validateAndReturnDate(value string) (string, linting) { 203 if value == "" { 204 return value, proWDE02 205 } 206 return value, nle 207} 208 209// validateAndReturnEncoding asks whether the encoding we can access 210// from Wikidata is known to Siegfried. If it isn't then we know for 211// now that we cannot handle it. If we cannot handle it, we either need 212// to correct the Wikidata record, or add capability to Siegfried or 213// the converter package. 214func validateAndReturnEncoding(value string) (int, linting) { 215 encoding := converter.LookupEncoding(value) 216 if encoding == converter.UnknownEncoding { 217 return encoding, encWDE01 218 } 219 return encoding, nle 220} 221 222// validateAndReturnRelativity will return a string and an error based 223// on whether the relativity of a format identification pattern, e.g. 224// BOF, EOF is known. If it isn't then it makes it more difficult to 225// process in Roy/Siegfried. 226func validateAndReturnRelativity(value string) (string, linting, error) { 227 const unknownRelativity = "Received an unknown relativity" 228 if value == "" { 229 // Assume beginning of file. 230 return relativeBOF, relWDE01, nil 231 } else if strings.ToLower(value) == relativeBOF { 232 return relativeBOF, nle, nil 233 } else if strings.ToLower(value) == relativeEOF { 234 return relativeEOF, nle, nil 235 } 236 return value, relWDE02, fmt.Errorf("%s: '%s'", unknownRelativity, value) 237} 238 239// validateAndReturnOffset will return an integer and an error based on 240// whether we can use the offset delivered by Wikidata. 
241func validateAndReturnOffset(value string, nodeType string) (int, linting) { 242 const blankNodeType = "bnode" 243 const blankNodeErr = "Received a blank node type instead of offset" 244 var offset int 245 if value == "" { 246 return offset, nle 247 } else if nodeType == blankNodeType { 248 return offset, offWDE03 249 } 250 offset, err := strconv.Atoi(value) 251 if err != nil { 252 return offset, offWDE02 253 } 254 return offset, nle 255} 256 257// validateAndReturnSignature calls the converter functions to normalize 258// our signature. We need to do this so that we can compare signatures 259// and remove duplicates and identify other errors. 260func validateAndReturnSignature(value string, encoding int) (string, linting, error) { 261 value, _, _, err := converter.Parse(value, encoding) 262 if err != nil { 263 return value, heuWDE02, err 264 } 265 return value, nle, nil 266} 267