1 /*  multipart.c -- GA4GH redirection and multipart backend for file streams.
2 
3     Copyright (C) 2016-2017 Genome Research Ltd.
4 
5     Author: John Marshall <jm18@sanger.ac.uk>
6 
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13 
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16 
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE.  */
24 
25 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
26 #include <config.h>
27 
28 #include <stdio.h>
29 #include <string.h>
30 #include <errno.h>
31 
32 #include "htslib/kstring.h"
33 
34 #include "hts_internal.h"
35 #include "hfile_internal.h"
36 
37 #ifndef EPROTO
38 #define EPROTO ENOEXEC
39 #endif
40 
// One component of a multipart stream: the location to fetch it from and
// any extra request headers to supply when opening that location.
typedef struct hfile_part {
    // Heap-allocated URL of this part; NULL before it has been parsed and
    // again after free_part() has released it.
    char *url;

    // Either NULL (no extra headers) or a heap-allocated, NULL-terminated
    // array of heap-allocated "Name: value" strings.
    char **headers;
} hfile_part;
45 
46 typedef struct {
47     hFILE base;
48     hfile_part *parts;
49     size_t nparts, maxparts, current;
50     hFILE *currentfp;
51 } hFILE_multipart;
52 
// Release the URL and header strings owned by *p and reset both fields to
// NULL, so that calling this again on the same part is harmless.
// The hfile_part structure itself is not freed.
static void free_part(hfile_part *p)
{
    char **headers = p->headers;

    free(p->url);
    p->url = NULL;

    if (headers != NULL) {
        // The array is NULL-terminated; free each string, then the array.
        size_t i;
        for (i = 0; headers[i] != NULL; i++) free(headers[i]);
        free(headers);
    }
    p->headers = NULL;
}
65 
// Release the contents of every part recorded in fp, then the parts array
// itself.  fp->parts and fp->nparts are left unchanged, so fp must not be
// used to access the parts afterwards.
static void free_all_parts(hFILE_multipart *fp)
{
    size_t idx = 0;

    while (idx < fp->nparts) {
        free_part(&fp->parts[idx]);
        idx++;
    }

    free(fp->parts);
}
72 
// Backend read method: fill buffer with up to nbytes bytes taken from the
// current part, transparently moving on to the next part whenever the
// current one is exhausted.
//
// Returns the number of bytes read, 0 once every part has been consumed,
// or a negative value if opening, reading, or closing a part fails.
static ssize_t multipart_read(hFILE *fpv, void *buffer, size_t nbytes)
{
    hFILE_multipart *fp = (hFILE_multipart *) fpv;
    // Signed, because both read paths below report errors as negative values.
    ssize_t n;

open_next:
    if (fp->currentfp == NULL) {
        if (fp->current < fp->nparts) {
            const hfile_part *p = &fp->parts[fp->current];
            // Log at most the first 120 characters of the URL.
            hts_log_debug("Opening part #%zu of %zu: \"%.120s%s\"",
                fp->current+1, fp->nparts, p->url,
                (strlen(p->url) > 120)? "..." : "");

            // Pass the part's extra headers along only when it has some.
            fp->currentfp = p->headers?
                  hopen(p->url, "r:",
                        "httphdr:v", p->headers,
                        "auth_token_enabled", "false", NULL)
                : hopen(p->url, "r:", "auth_token_enabled", "false", NULL);

            if (fp->currentfp == NULL) return -1;
        }
        else return 0;  // No more parts, so we're truly at EOF
    }

    // Streams flagged as mobile are read via their backend directly;
    // all others go through hread().
    n = fp->currentfp->mobile?
          fp->currentfp->backend->read(fp->currentfp, buffer, nbytes)
        : hread(fp->currentfp, buffer, nbytes);

    if (n == 0) {
        // We're at EOF on this part, so set up the next part
        hFILE *prevfp = fp->currentfp;
        free_part(&fp->parts[fp->current]);
        fp->current++;
        fp->currentfp = NULL;
        if (hclose(prevfp) < 0) return -1;
        goto open_next;
    }

    return n;  // Number of bytes read by (or an error from) fp->currentfp
}
113 
// Backend write method.  Multipart streams cannot be written to, so this
// ignores its arguments and always fails with errno set to EROFS.
static ssize_t multipart_write(hFILE *fpv, const void *buffer, size_t nbytes)
{
    (void) fpv;
    (void) buffer;
    (void) nbytes;

    errno = EROFS;
    return -1;
}
119 
// Backend seek method.  Seeking is not supported on multipart streams, so
// this ignores its arguments and always fails with errno set to ESPIPE.
static off_t multipart_seek(hFILE *fpv, off_t offset, int whence)
{
    (void) fpv;
    (void) offset;
    (void) whence;

    errno = ESPIPE;
    return -1;
}
125 
// Backend close method: release all remaining parts and close the part
// stream that is currently open, if there is one.
// Returns 0 on success, or -1 if closing the open part stream fails.
static int multipart_close(hFILE *fpv)
{
    hFILE_multipart *fp = (hFILE_multipart *) fpv;
    hFILE *open_part = fp->currentfp;

    free_all_parts(fp);

    if (open_part == NULL) return 0;
    return (hclose(open_part) < 0)? -1 : 0;
}
137 
138 static const struct hFILE_backend multipart_backend =
139 {
140     multipart_read, multipart_write, multipart_seek, NULL, multipart_close
141 };
142 
143 // Returns 'v' (valid value), 'i' (invalid; required GA4GH field missing),
144 // or upon encountering an unexpected token, that token's type.
145 // Explicit `return '?'` means a JSON parsing error, typically a member key
146 // that is not a string.  An unexpected token may be a valid token that was
147 // not the type expected for a particular GA4GH field, or it may be '?' or
148 // '\0' which should be propagated.
149 static char
parse_ga4gh_body_json(hFILE_multipart * fp,hFILE * json,kstring_t * b,kstring_t * header)150 parse_ga4gh_body_json(hFILE_multipart *fp, hFILE *json,
151                       kstring_t *b, kstring_t *header)
152 {
153     hts_json_token t;
154 
155     if (hts_json_fnext(json, &t, b) != '{') return t.type;
156     while (hts_json_fnext(json, &t, b) != '}') {
157         if (t.type != 's') return '?';
158 
159         if (strcmp(t.str, "urls") == 0) {
160             if (hts_json_fnext(json, &t, b) != '[') return t.type;
161 
162             while (hts_json_fnext(json, &t, b) != ']') {
163                 hfile_part *part;
164                 size_t n = 0, max = 0;
165 
166                 hts_expand(hfile_part, fp->nparts+1, fp->maxparts, fp->parts);
167                 part = &fp->parts[fp->nparts++];
168                 part->url = NULL;
169                 part->headers = NULL;
170 
171                 if (t.type != '{') return t.type;
172                 while (hts_json_fnext(json, &t, b) != '}') {
173                     if (t.type != 's') return '?';
174 
175                     if (strcmp(t.str, "url") == 0) {
176                         if (hts_json_fnext(json, &t, b) != 's') return t.type;
177                         part->url = ks_release(b);
178                     }
179                     else if (strcmp(t.str, "headers") == 0) {
180                         if (hts_json_fnext(json, &t, b) != '{') return t.type;
181 
182                         while (hts_json_fnext(json, &t, header) != '}') {
183                             if (t.type != 's') return '?';
184 
185                             if (hts_json_fnext(json, &t, b) != 's')
186                                 return t.type;
187 
188                             kputs(": ", header);
189                             kputs(t.str, header);
190                             n++;
191                             hts_expand(char *, n+1, max, part->headers);
192                             part->headers[n-1] = ks_release(header);
193                             part->headers[n] = NULL;
194                         }
195                     }
196                     else if (hts_json_fskip_value(json, '\0') != 'v')
197                         return '?';
198                 }
199 
200                 if (! part->url) return 'i';
201             }
202         }
203         else if (strcmp(t.str, "format") == 0) {
204             if (hts_json_fnext(json, &t, b) != 's') return t.type;
205 
206             hts_log_debug("GA4GH JSON redirection to multipart %s data", t.str);
207         }
208         else if (hts_json_fskip_value(json, '\0') != 'v') return '?';
209     }
210 
211     return 'v';
212 }
213 
214 // Returns 'v' (valid value), 'i' (invalid; required GA4GH field missing),
215 // or upon encountering an unexpected token, that token's type.
216 // Explicit `return '?'` means a JSON parsing error, typically a member key
217 // that is not a string.  An unexpected token may be a valid token that was
218 // not the type expected for a particular GA4GH field, or it may be '?' or
219 // '\0' which should be propagated.
220 static char
parse_ga4gh_redirect_json(hFILE_multipart * fp,hFILE * json,kstring_t * b,kstring_t * header)221 parse_ga4gh_redirect_json(hFILE_multipart *fp, hFILE *json,
222                           kstring_t *b, kstring_t *header) {
223     hts_json_token t;
224 
225     if (hts_json_fnext(json, &t, b) != '{') return t.type;
226     while (hts_json_fnext(json, &t, b) != '}') {
227         if (t.type != 's') return '?';
228 
229         if (strcmp(t.str, "htsget") == 0) {
230             char ret = parse_ga4gh_body_json(fp, json, b, header);
231             if (ret != 'v') return ret;
232         }
233         else return '?';
234     }
235 
236     if (hts_json_fnext(json, &t, b) != '\0') return '?';
237 
238     return 'v';
239 }
240 
// Creates a multipart hFILE from the GA4GH redirection JSON read from hfile.
//
// hfile is only read from here; this function does not close it.  mode is
// passed through to hfile_init().
//
// Returns the new stream, or NULL on failure.  If the JSON is rejected,
// errno is set to EPROTO for a parse error or premature end of input, and
// to EINVAL for other invalid content.
hFILE *hopen_htsget_redirect(hFILE *hfile, const char *mode)
{
    kstring_t body = { 0, 0, NULL }, hdr = { 0, 0, NULL };
    hFILE_multipart *fp;
    char status;

    fp = (hFILE_multipart *) hfile_init(sizeof (hFILE_multipart), mode, 0);
    if (fp == NULL) return NULL;

    // Start with an empty parts array for the parser to append to.
    fp->parts = NULL;
    fp->nparts = 0;
    fp->maxparts = 0;

    status = parse_ga4gh_redirect_json(fp, hfile, &body, &hdr);
    free(body.s);
    free(hdr.s);

    if (status == 'v') {
        // Parts are opened lazily, on the first read.
        fp->current = 0;
        fp->currentfp = NULL;
        fp->base.backend = &multipart_backend;
        return &fp->base;
    }

    // Discard whatever parts were collected before the failure.
    free_all_parts(fp);
    hfile_destroy((hFILE *) fp);
    if (status == '?' || status == '\0') errno = EPROTO;
    else errno = EINVAL;
    return NULL;
}
268