1// Copyright 2015 Richard Lehane. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package main
16
17import (
18	"encoding/base64"
19	"fmt"
20	"io"
21	"net/http"
22	"os"
23	"sync"
24	"time"
25
26	"github.com/richardlehane/siegfried"
27	"github.com/richardlehane/siegfried/internal/checksum"
28	"github.com/richardlehane/siegfried/pkg/config"
29	"github.com/richardlehane/siegfried/pkg/writer"
30)
31
// handleErr writes a plain-text error response with the given HTTP status.
//
// The Content-Type header is set before WriteHeader is called: net/http
// ignores changes to the header map once the status line has been written.
func handleErr(w http.ResponseWriter, status int, e error) {
	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
	w.WriteHeader(status)
	// The response has already been committed; a failed write to the client
	// cannot be reported anywhere useful, so the result is deliberately ignored.
	_, _ = fmt.Fprintf(w, "SF server error; got %v\n", e)
}
37
// decodePath extracts the target file or directory path from a request URL
// path s by dropping its first 10 bytes (the length of "/identify/").
//
// When b64 is exactly "true" the remainder is decoded as URL-safe base64;
// any other value returns the remainder unchanged. An error is returned if
// s is shorter than 11 bytes or if base64 decoding fails.
func decodePath(s, b64 string) (string, error) {
	const skip = 10
	if n := len(s); n <= skip {
		return "", fmt.Errorf("path too short, expecting at least 11 characters got %d", n)
	}
	rest := s[skip:]
	if b64 != "true" {
		return rest, nil
	}
	decoded, err := base64.URLEncoding.DecodeString(rest)
	if err != nil {
		return "", fmt.Errorf("Error base64 decoding file path, error message %v", err)
	}
	return string(decoded), nil
}
51
52func parseRequest(w http.ResponseWriter, r *http.Request, s *siegfried.Siegfried, wg *sync.WaitGroup) (error, string, writer.Writer, bool, bool, bool, checksum.HashTyp, *siegfried.Siegfried, getFn) {
53	// json, csv, droid or yaml
54	paramsErr := func(field, expect string) (error, string, writer.Writer, bool, bool, bool, checksum.HashTyp, *siegfried.Siegfried, getFn) {
55		return fmt.Errorf("bad request; in param %s got %s; valid values %s", field, r.FormValue(field), expect), "", nil, false, false, false, -1, nil, nil
56	}
57	var (
58		mime string
59		wr   writer.Writer
60		d    bool
61		frmt int
62	)
63	switch {
64	case *jsono:
65		frmt = 1
66	case *csvo:
67		frmt = 2
68	case *droido:
69		frmt = 3
70	}
71	if v := r.FormValue("format"); v != "" {
72		switch v {
73		case "yaml":
74			frmt = 0
75		case "json":
76			frmt = 1
77		case "csv":
78			frmt = 2
79		case "droid":
80			frmt = 3
81		default:
82			return paramsErr("format", "yaml, json, csv or droid")
83		}
84	}
85	if accept := r.Header.Get("Accept"); accept != "" {
86		switch accept {
87		case "application/x-yaml":
88			frmt = 0
89		case "application/json":
90			frmt = 1
91		case "text/csv", "application/csv":
92			frmt = 2
93		case "application/x-droid":
94			frmt = 3
95		}
96	}
97	switch frmt {
98	case 0:
99		wr = writer.YAML(w)
100		mime = "application/x-yaml"
101	case 1:
102		wr = writer.JSON(w)
103		mime = "application/json"
104	case 2:
105		wr = writer.CSV(w)
106		mime = "text/csv"
107	case 3:
108		wr = writer.Droid(w)
109		d = true
110		mime = "application/x-droid"
111	}
112	// no recurse
113	norec := *nr
114	if v := r.FormValue("nr"); v != "" {
115		switch v {
116		case "true":
117			norec = true
118		case "false":
119			norec = false
120		default:
121			paramsErr("nr", "true or false")
122		}
123	}
124	// continue on error
125	coerr := *coe
126	if v := r.FormValue("coe"); v != "" {
127		switch v {
128		case "true":
129			coerr = true
130		case "false":
131			coerr = false
132		default:
133			paramsErr("coe", "true or false")
134		}
135	}
136	// archive
137	z := *archive
138	if v := r.FormValue("z"); v != "" {
139		switch v {
140		case "true":
141			z = true
142		case "false":
143			z = false
144		default:
145			paramsErr("z", "true or false")
146		}
147	}
148	// checksum
149	h := *hashf
150	if v := r.FormValue("hash"); v != "" {
151		h = v
152	}
153	ht := checksum.GetHash(h)
154	// sig
155	sf := s
156	if v := r.FormValue("sig"); v != "" {
157		if _, err := os.Stat(config.Local(v)); err != nil {
158			return fmt.Errorf("bad request; sig param should be path to a signature file (absolute or relative to home); got %v", err), "", nil, false, false, false, -1, nil, nil
159		}
160		nsf, err := siegfried.Load(config.Local(v))
161		if err == nil {
162			sf = nsf
163		}
164	}
165	gf := func(path, mime string, mod time.Time, sz int64) *context {
166		c := ctxPool.Get().(*context)
167		c.path, c.mime, c.mod, c.sz = path, mime, mod, sz
168		c.s, c.wg, c.w, c.d, c.z, c.h = sf, wg, wr, d, z, checksum.MakeHash(ht)
169		return c
170	}
171	return nil, mime, wr, coerr, norec, d, ht, sf, gf
172}
173
174func handleIdentify(w http.ResponseWriter, r *http.Request, s *siegfried.Siegfried, ctxts chan *context) {
175	wg := &sync.WaitGroup{}
176	err, mime, wr, coerr, nrec, d, ht, sf, gf := parseRequest(w, r, s, wg)
177	if err != nil {
178		handleErr(w, http.StatusNotFound, err)
179		return
180	}
181	if r.Method == "POST" {
182		f, h, err := r.FormFile("file")
183		if err != nil {
184			handleErr(w, http.StatusNotFound, err)
185			return
186		}
187		defer f.Close()
188		var sz int64
189		var mod time.Time
190		osf, ok := f.(*os.File)
191		if ok {
192			info, err := osf.Stat()
193			if err != nil {
194				handleErr(w, http.StatusInternalServerError, err)
195			}
196			sz = info.Size()
197			mod = info.ModTime()
198		} else {
199			sz = r.ContentLength
200		}
201		w.Header().Set("Content-Type", mime)
202		wr.Head(config.SignatureBase(), time.Now(), sf.C, config.Version(), sf.Identifiers(), sf.Fields(), ht.String())
203		wg.Add(1)
204		ctx := gf(h.Filename, "", mod, sz)
205		ctxts <- ctx
206		identifyRdr(f, ctx, ctxts, gf)
207		wg.Wait()
208		wr.Tail()
209		return
210	} else {
211		path, err := decodePath(r.URL.Path, r.FormValue("base64"))
212		if err == nil {
213			_, err = os.Stat(path)
214		}
215		if err != nil {
216			handleErr(w, http.StatusNotFound, err)
217			return
218		}
219		w.Header().Set("Content-Type", mime)
220		wr.Head(config.SignatureBase(), time.Now(), sf.C, config.Version(), sf.Identifiers(), sf.Fields(), ht.String())
221		err = identify(ctxts, path, "", coerr, nrec, d, gf)
222		wg.Wait()
223		wr.Tail()
224		if err != nil {
225			if _, ok := err.(WalkError); ok { // only dump out walk errors, other errors reported in result
226				io.WriteString(w, err.Error())
227			}
228		}
229		return
230	}
231}
232
// usage is the HTML help page served at the server root. It documents the
// GET and POST forms of the /identify endpoint and embeds an example form
// for each.
const usage = `
	<html>
		<head>
			<title>Siegfried server</title>
		</head>
		<body>
			<h1><a name="top">Siegfried server usage</a></h1>
			<p>The siegfried server has two modes of identification:
			<ul><li><a href="#get_request">GET request</a>, where a file or directory path is given in the URL and the server retrieves the file(s);</li>
			<li><a href="#post_request">POST request</a>, where the file is sent over the network as form-data.</li></ul></p>
			<h2>Default settings</h2>
			<p>When starting the server, you can use regular sf flags to set defaults for the <i>nr</i>, <i>format</i>, <i>hash</i>, <i>z</i>, and <i>sig</i> parameters that will apply to all requests unless overridden. Logging options can also be set.</p>
			<p>E.g. sf -nr -z -hash md5 -sig pronom-tika.sig -log p,w,e -serve localhost:5138</p>
			<hr>
			<h2><a name="get_request">GET request</a></h2>
			<p><strong>GET</strong> <i>/identify/[file or folder name (percent encoded)](?base64=false&nr=true&format=yaml&hash=md5&z=true&sig=locfdd.sig)</i></p>
			<p>E.g. http://localhost:5138/identify/c%3A%2FUsers%2Frichardl%2FMy%20Documents%2Fhello%20world.docx?format=json</p>
			<h3>Parameters</h3>
			<p><i>base64</i> (optional) - use <a href="https://tools.ietf.org/html/rfc4648#section-5">URL-safe base64 encoding</a> for the file or folder name with base64=true.</p>
			<p><i>coe</i> (optional) - continue directory scans even when fatal file access errors are encountered with coe=true.</p>
			<p><i>nr</i> (optional) - stop sub-directory recursion when a directory path is given with nr=true.</p>
			<p><i>format</i> (optional) - select the output format (csv, yaml, json, droid). Default is yaml. Alternatively, HTTP content negotiation can be used.</p>
			<p><i>hash</i> (optional) - calculate file checksum (md5, sha1, sha256, sha512, crc)</p>
			<p><i>z</i> (optional) - scan archive formats (zip, tar, gzip, warc, arc) with z=true. Default is false.</p>
			<p><i>sig</i> (optional) - load a specific signature file. Default is default.sig.</p>
			<h3>Example</h3>
			<!-- set the get target for the example form using js function at bottom page-->
			<h4>File/ directory:</h4>
			<p><input type="text" id="filename"> (provide the path to a file or directory e.g. c:\My Documents\file.doc. It will be percent encoded by this form.)</p>
			<h4>Parameters:</h4>
			<form method="get" id="get_example">
			  <p>Use base64 encoding (base64): <input type="radio" name="base64" value="true"> true <input type="radio" name="base64" value="false" checked> false</p>
			 <p>Continue on error (coe): <input type="radio" name="coe" value="true"> true <input type="radio" name="coe" value="false" checked> false</p>
			 <p>No directory recursion (nr): <input type="radio" name="nr" value="true"> true <input type="radio" name="nr" value="false" checked> false</p>
			 <p>Format (format): <select name="format">
  				<option value="json">json</option>
  				<option value="yaml">yaml</option>
  				<option value="csv">csv</option>
 				<option value="droid">droid</option>
			</select></p>
			 <p>Hash (hash): <select name="hash">
  				<option value="none">none</option>
  				<option value="md5">md5</option>
  				<option value="sha1">sha1</option>
 				<option value="sha256">sha256</option>
 				<option value="sha512">sha512</option>
 				<option value="crc">crc</option>
			</select></p>
			 <p>Scan archive (z): <input type="radio" name="z" value="true"> true <input type="radio" name="z" value="false" checked> false</p>
			 <p>Signature file (sig): <input type="text" name="sig"></p>
			 <p><input type="submit" value="Submit"></p>
			</form>
			<p><a href="#top">Back to top</a></p>
			<hr>
			<h2><a name="post_request">POST request</a></h2>
			<p><strong>POST</strong> <i>/identify(?format=yaml&hash=md5&z=true&sig=locfdd.sig)</i> Attach a file as form-data with the key "file".</p>
			<p>E.g. curl "http://localhost:5138/identify?format=json&hash=crc" -F file=@myfile.doc</p>
			<h3>Parameters</h3>
			<p><i>format</i> (optional) - select the output format (csv, yaml, json, droid). Default is yaml. Alternatively, HTTP content negotiation can be used.</p>
			<p><i>hash</i> (optional) - calculate file checksum (md5, sha1, sha256, sha512, crc)</p>
			<p><i>z</i> (optional) - scan archive formats (zip, tar, gzip, warc, arc) with z=true. Default is false.</p>
			<p><i>sig</i> (optional) - load a specific signature file. Default is default.sig.</p>
			<h3>Example</h3>
			<form action="/identify" enctype="multipart/form-data" method="post">
			 <h4>File:</h4>
			 <p><input type="file" name="file"></p>
			 <h4>Parameters:</h4>
			 <p>Format (format): <select name="format">
  				<option value="json">json</option>
  				<option value="yaml">yaml</option>
  				<option value="csv">csv</option>
 				<option value="droid">droid</option>
			</select></p>
			 <p>Hash (hash): <select name="hash">
  				<option value="none">none</option>
  				<option value="md5">md5</option>
  				<option value="sha1">sha1</option>
 				<option value="sha256">sha256</option>
 				<option value="sha512">sha512</option>
 				<option value="crc">crc</option>
			</select></p>
			 <p>Scan archive (z): <input type="radio" name="z" value="true"> true <input type="radio" name="z" value="false" checked> false</p>
			 <p>Signature file (sig): <input type="text" name="sig"></p>
			 <p><input type="submit" value="Submit"></p>
			</form>
			<p><a href="#top">Back to top</a></p>
			<script>
				var input = document.getElementById('filename');
				input.addEventListener('input', function()
				{
					var frm = document.getElementById('get_example');
					frm.action = "/identify/" + encodeURIComponent(input.value);
				});
			</script>
		</body>
	</html>
`
330
331func handleMain(w http.ResponseWriter, r *http.Request) {
332	w.Header().Set("Content-Type", "text/html")
333	io.WriteString(w, usage)
334}
335
// muxer is the server's http.Handler. It carries the values that are handed
// to handleIdentify for every identify request.
type muxer struct {
	s     *siegfried.Siegfried // passed to handleIdentify as the siegfried to identify with
	ctxts chan *context        // channel passed to handleIdentify, on which contexts are sent
}
340
341func (m *muxer) ServeHTTP(w http.ResponseWriter, r *http.Request) {
342	if (len(r.URL.Path) == 0 || r.URL.Path == "/") && r.Method == "GET" {
343		handleMain(w, r)
344		return
345	}
346	if len(r.URL.Path) >= 9 && r.URL.Path[:9] == "/identify" {
347		handleIdentify(w, r, m.s, m.ctxts)
348		return
349	}
350	handleErr(w, http.StatusNotFound, fmt.Errorf("valid paths are /, /identify and /identify/*"))
351	return
352}
353
354func listen(port string, s *siegfried.Siegfried, ctxts chan *context) {
355	mux := &muxer{s, ctxts}
356	http.ListenAndServe(port, mux)
357}
358