1// Copyright 2020 Ross Spencer, Richard Lehane. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 12// implied. See the License for the specific language governing 13// permissions and limitations under the License. 14 15// Satisfies the Identifier interface. 16 17package wikidata 18 19import ( 20 "encoding/json" 21 "fmt" 22 "log" 23 "strings" 24 "time" 25 26 "github.com/richardlehane/siegfried/internal/identifier" 27 "github.com/richardlehane/siegfried/pkg/config" 28 "github.com/richardlehane/siegfried/pkg/core" 29) 30 31const unknown = "UNKNOWN" 32const identifierDateFormat = "2006-01-02" 33 34// Initialize the variables needed by this file. 35func init() { 36 core.RegisterIdentifier(core.Wikidata, Load) 37} 38 39// Identifier contains a set of Wikidata records and an implementation 40// of the identifier interface for consuming. 41type Identifier struct { 42 infos map[string]formatInfo 43 *identifier.Base 44} 45 46// Global that allows us to do is keep track of the PUIDs going to be 47// output in the identifier which need provenance. At least it felt 48// needed at the time, but need to look at in more detail. We may 49// eventually delete this in favor of something "less-global". 50var sourcePuids []string 51 52// New is the entry point for an Identifier when it is compiled by the Roy tool 53// to a brand new signature file. 54// 55// New will read a Wikidata report, and parse its information into structures 56// suitable for compilation by Roy. 57// 58// New will also update its identification information with provenance-like 59// info. It will enable signature extensions to be added by the utility, and 60// enables configuration to be applied as well. 61// 62func New(opts ...config.Option) (core.Identifier, error) { 63 for _, v := range opts { 64 v() 65 } 66 log.Println("Roy (Wikidata): Congratulations: doing something with the Wikidata identifier package!") 67 wikidata, puids, err := newWikidata() 68 if err != nil { 69 return nil, fmt.Errorf("Roy (Wikidata): error in New Wikidata: %s", err) 70 } 71 // Having retrieved our PUIDs from newWikidata, assign them to our 72 // provenance global to generate source information from Wikidata. 73 sourcePuids = puids 74 updatedDate := time.Now().Format(identifierDateFormat) 75 wikidata = identifier.ApplyConfig(wikidata) 76 base := identifier.New( 77 wikidata, 78 "Wikidata Name: I don't think this field is used...", 79 updatedDate, 80 ) 81 infos := infos(wikidata.Infos()) 82 return &Identifier{ 83 infos: infos, 84 Base: base, 85 }, nil 86} 87 88// Recorder provides a recorder for matching. 89func (i *Identifier) Recorder() core.Recorder { 90 return &Recorder{ 91 Identifier: i, 92 ids: make(matchIDs, 0, 1), 93 } 94} 95 96// Identification contains the result of a single ID for a file. There may be 97// multiple, per file. The identification to the user looks something like as 98// follows: 99// 100// - ns : 'wikidata' 101// id : 'Q1343830' 102// format : 'Executable and Linkable Format' 103// URI : 'http://www.wikidata.org/entity/Q1343830' 104// mime : 105// basis : 'byte match at 0, 4 (signature 1/5); byte match at 0, 7 (signature 4/5)' 106// source : 'Gary Kessler''s File Signature Table (source date: 2017-08-08) PRONOM (Official (fmt/689))' 107// warning : 108// 109type Identification struct { 110 Namespace string // Namespace of the identifier, e.g. this will be the 'wikidata' namespace. 111 ID string // QID of the file format according to Wikidata. 112 Name string // Complete name of the format identification. Often includes version. 113 LongName string // IRI of the Wikidata record. 114 MIME string // MIMEtypes associated with the record. 115 Basis []string // Basis for the result returned by Siegfried. 116 Source []string // Provenance information associated with the result. 117 Warning string // Warnings generated by Siegfried. 118 archive config.Archive // Is it an Archive format? 119 confidence int // Identification confidence for sorting. 120} 121 122// String creates a human readable representation of an identifier for output 123// by fmt-like functions. 124func (id Identification) String() string { 125 str, err := json.MarshalIndent(id, "", " ") 126 if err != nil { 127 return "" 128 } 129 return fmt.Sprintf("%s", str) 130} 131 132// Fields describes a portion of YAML that will be output by Siegfried's 133// identifier for an individual match. E.g. 134// 135// matches : 136// - ns : 'wikidata' 137// id : 'Q475488' 138// format : 'EPUB' 139// ... : '...' 140// ... : '...' 141// custom : 'your custom field' 142// custom : '...' 143// 144// siegfried/pkg/writer/writer.go normalizes the output of this field 145// grouping so that if it sees certain fields, e.g. namespace, then it 146// can convert that to something anticipated by the consumer, 147// 148// e.g. namespace => becomes => ns 149// 150func (i *Identifier) Fields() []string { 151 // Results with extra source field we can populate with provenance 152 // information. 153 var resultsFieldsWithSource = []string{ 154 "namespace", 155 "id", 156 "format", 157 "URI", 158 "mime", 159 "basis", 160 "source", 161 "warning", 162 } 163 // Result field without source field. This is a little more like 164 // other identifiers used in Siegfried. 165 var resultsFueldsWithoutSource = []string{ 166 "namespace", 167 "id", 168 "format", 169 "URI", 170 "mime", 171 "basis", 172 "warning", 173 } 174 if config.GetWikidataSourceField() { 175 return resultsFieldsWithSource 176 } 177 return resultsFueldsWithoutSource 178} 179 180// Archive should tell us if any identifiers match those considered to 181// be an archive format so that they can be extracted and the contents 182// identified. 183func (id Identification) Archive() config.Archive { 184 return id.archive 185} 186 187// Known returns false if the ID isn't recognized or true if so. 188func (id Identification) Known() bool { 189 return id.ID != unknown 190} 191 192// Warn returns the warning associated with an identification. 193func (id Identification) Warn() string { 194 return id.Warning 195} 196 197// Values returns a string slice containing each of the identifier segments. 198func (id Identification) Values() []string { 199 var basis string 200 var source string 201 if len(id.Basis) > 0 { 202 basis = strings.Join(id.Basis, "; ") 203 } 204 if config.GetWikidataSourceField() { 205 if len(id.Source) > 0 { 206 if id.Source[0] != "" { 207 source = strings.Join(id.Source, "; ") 208 } 209 source = strings.TrimSpace(strings.Join(id.Source, " ")) 210 } 211 // Slice must match the order of resultsFieldsWithSource. 212 return []string{ 213 id.Namespace, 214 id.ID, 215 id.Name, 216 id.LongName, 217 id.MIME, 218 basis, 219 source, 220 id.Warning, 221 } 222 } 223 // Slice must match the order of resultsFueldsWithoutSource. 224 return []string{ 225 id.Namespace, 226 id.ID, 227 id.Name, 228 id.LongName, 229 id.MIME, 230 basis, 231 id.Warning, 232 } 233} 234