1// Copyright 2020 Ross Spencer, Richard Lehane. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
12// implied. See the License for the specific language governing
13// permissions and limitations under the License.
14
// This file defines the Wikidata Identifier, which satisfies the
// core.Identifier interface.
16
17package wikidata
18
19import (
20	"encoding/json"
21	"fmt"
22	"log"
23	"strings"
24	"time"
25
26	"github.com/richardlehane/siegfried/internal/identifier"
27	"github.com/richardlehane/siegfried/pkg/config"
28	"github.com/richardlehane/siegfried/pkg/core"
29)
30
const (
	// unknown is the sentinel ID value that marks an identification as
	// not recognized (see Identification.Known).
	unknown = "UNKNOWN"

	// identifierDateFormat is the time layout (year-month-day) used to
	// stamp the identifier with the date it was built.
	identifierDateFormat = "2006-01-02"
)
33
// init registers this package's Load function with the core package under
// the core.Wikidata key. It runs automatically when the package is
// imported.
func init() {
	core.RegisterIdentifier(core.Wikidata, Load)
}
38
// Identifier contains a set of Wikidata records and an implementation
// of the identifier interface for consuming them.
type Identifier struct {
	// infos maps a string key to the format information for a record.
	// NOTE(review): the key is presumably the Wikidata QID; confirm
	// against the infos helper that builds this map.
	infos map[string]formatInfo
	// Base is embedded so that its methods are promoted onto Identifier.
	*identifier.Base
}
45
// sourcePuids is a package-level list of the PUIDs that will be output by
// the identifier and which need provenance information. It is assigned by
// New from the values returned by newWikidata.
//
// TODO: this global felt necessary when it was written, but it needs a
// closer look; it may eventually be replaced with something
// "less-global".
var sourcePuids []string
51
52// New is the entry point for an Identifier when it is compiled by the Roy tool
53// to a brand new signature file.
54//
55// New will read a Wikidata report, and parse its information into structures
56// suitable for compilation by Roy.
57//
58// New will also update its identification information with provenance-like
59// info. It will enable signature extensions to be added by the utility, and
60// enables configuration to be applied as well.
61//
62func New(opts ...config.Option) (core.Identifier, error) {
63	for _, v := range opts {
64		v()
65	}
66	log.Println("Roy (Wikidata): Congratulations: doing something with the Wikidata identifier package!")
67	wikidata, puids, err := newWikidata()
68	if err != nil {
69		return nil, fmt.Errorf("Roy (Wikidata): error in New Wikidata: %s", err)
70	}
71	// Having retrieved our PUIDs from newWikidata, assign them to our
72	// provenance global to generate source information from Wikidata.
73	sourcePuids = puids
74	updatedDate := time.Now().Format(identifierDateFormat)
75	wikidata = identifier.ApplyConfig(wikidata)
76	base := identifier.New(
77		wikidata,
78		"Wikidata Name: I don't think this field is used...",
79		updatedDate,
80	)
81	infos := infos(wikidata.Infos())
82	return &Identifier{
83		infos: infos,
84		Base:  base,
85	}, nil
86}
87
88// Recorder provides a recorder for matching.
89func (i *Identifier) Recorder() core.Recorder {
90	return &Recorder{
91		Identifier: i,
92		ids:        make(matchIDs, 0, 1),
93	}
94}
95
// Identification contains the result of a single ID for a file. There may
// be several per file. Presented to the user, an identification looks
// something like this:
//
//  - ns      : 'wikidata'
//    id      : 'Q1343830'
//    format  : 'Executable and Linkable Format'
//    URI     : 'http://www.wikidata.org/entity/Q1343830'
//    mime    :
//    basis   : 'byte match at 0, 4 (signature 1/5); byte match at 0, 7 (signature 4/5)'
//    source  : 'Gary Kessler''s File Signature Table (source date: 2017-08-08) PRONOM (Official (fmt/689))'
//    warning :
//
// The exported fields are emitted, in declaration order, by String and
// Values; the unexported fields are internal bookkeeping.
type Identification struct {
	Namespace  string         // Namespace of the identifier, e.g. this will be the 'wikidata' namespace.
	ID         string         // QID of the file format according to Wikidata.
	Name       string         // Complete name of the format identification. Often includes version.
	LongName   string         // IRI of the Wikidata record (output under the 'URI' field).
	MIME       string         // MIME types associated with the record.
	Basis      []string       // Basis for the result returned by Siegfried.
	Source     []string       // Provenance information associated with the result.
	Warning    string         // Warnings generated by Siegfried.
	archive    config.Archive // Archive type of the format; returned by Archive.
	confidence int            // Identification confidence for sorting.
}
121
122// String creates a human readable representation of an identifier for output
123// by fmt-like functions.
124func (id Identification) String() string {
125	str, err := json.MarshalIndent(id, "", "  ")
126	if err != nil {
127		return ""
128	}
129	return fmt.Sprintf("%s", str)
130}
131
132// Fields describes a portion of YAML that will be output by Siegfried's
133// identifier for an individual match. E.g.
134//
135//      matches  :
136//        - ns      : 'wikidata'
137//          id      : 'Q475488'
138//          format  : 'EPUB'
139//          ...     : '...'
140//          ...     : '...'
141//          custom  : 'your custom field'
142//          custom  : '...'
143//
144// siegfried/pkg/writer/writer.go normalizes the output of this field
145// grouping so that if it sees certain fields, e.g. namespace, then it
146// can convert that to something anticipated by the consumer,
147//
148//      e.g. namespace => becomes => ns
149//
150func (i *Identifier) Fields() []string {
151	// Results with extra source field we can populate with provenance
152	// information.
153	var resultsFieldsWithSource = []string{
154		"namespace",
155		"id",
156		"format",
157		"URI",
158		"mime",
159		"basis",
160		"source",
161		"warning",
162	}
163	// Result field without source field. This is a little more like
164	// other identifiers used in Siegfried.
165	var resultsFueldsWithoutSource = []string{
166		"namespace",
167		"id",
168		"format",
169		"URI",
170		"mime",
171		"basis",
172		"warning",
173	}
174	if config.GetWikidataSourceField() {
175		return resultsFieldsWithSource
176	}
177	return resultsFueldsWithoutSource
178}
179
// Archive returns the archive type recorded for this identification. It is
// meant to tell callers whether the match is considered an archive format,
// so that the archive can be extracted and its contents identified.
func (id Identification) Archive() config.Archive {
	return id.archive
}
186
// Known reports whether the identification was recognized: it returns
// false when ID equals the "UNKNOWN" sentinel and true otherwise.
func (id Identification) Known() bool {
	return id.ID != unknown
}
191
// Warn returns the warning associated with an identification, i.e. the
// value of its Warning field (empty when there is none).
func (id Identification) Warn() string {
	return id.Warning
}
196
197// Values returns a string slice containing each of the identifier segments.
198func (id Identification) Values() []string {
199	var basis string
200	var source string
201	if len(id.Basis) > 0 {
202		basis = strings.Join(id.Basis, "; ")
203	}
204	if config.GetWikidataSourceField() {
205		if len(id.Source) > 0 {
206			if id.Source[0] != "" {
207				source = strings.Join(id.Source, "; ")
208			}
209			source = strings.TrimSpace(strings.Join(id.Source, " "))
210		}
211		// Slice must match the order of resultsFieldsWithSource.
212		return []string{
213			id.Namespace,
214			id.ID,
215			id.Name,
216			id.LongName,
217			id.MIME,
218			basis,
219			source,
220			id.Warning,
221		}
222	}
223	// Slice must match the order of resultsFueldsWithoutSource.
224	return []string{
225		id.Namespace,
226		id.ID,
227		id.Name,
228		id.LongName,
229		id.MIME,
230		basis,
231		id.Warning,
232	}
233}
234