1 /*  multipart.c -- GA4GH redirection and multipart backend for file streams.
2 
3     Copyright (C) 2016 Genome Research Ltd.
4 
5     Author: John Marshall <jm18@sanger.ac.uk>
6 
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13 
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16 
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE.  */
24 
25 #include <config.h>
26 
27 #include <stdio.h>
28 #include <string.h>
29 #include <errno.h>
30 
31 #include "htslib/kstring.h"
32 
33 #include "hts_internal.h"
34 #include "hfile_internal.h"
35 
// Provide a stand-in on platforms whose <errno.h> lacks EPROTO, so the
// JSON-parsing failure path below always has an errno value to report.
#if !defined(EPROTO)
#define EPROTO ENOEXEC
#endif
39 
// One downloadable piece of the multipart stream.
struct hfile_part {
    // Location handed to hopen() when this part's turn comes.
    // Heap-allocated; released by free_part().
    char *url;

    // Either NULL (no extra headers) or a NULL-terminated array of
    // heap-allocated "Name: value" strings, passed to hopen() via "httphdr:v".
    char **headers;
};

typedef struct hfile_part hfile_part;
44 
45 typedef struct {
46     hFILE base;
47     hfile_part *parts;
48     size_t nparts, maxparts, current;
49     hFILE *currentfp;
50 } hFILE_multipart;
51 
// Releases the URL and header strings owned by *p and resets both fields to
// NULL, so calling this again on the same part is harmless.  The hfile_part
// structure itself is not freed.
static void free_part(hfile_part *p)
{
    char **hdrs = p->headers;

    free(p->url);
    p->url = NULL;

    if (hdrs != NULL) {
        size_t i;
        // The array is NULL-terminated; each entry is individually allocated.
        for (i = 0; hdrs[i] != NULL; i++) free(hdrs[i]);
        free(hdrs);
    }
    p->headers = NULL;
}
64 
// Releases the contents of every part recorded in fp and then the parts
// array itself.  Does not touch fp->currentfp.
static void free_all_parts(hFILE_multipart *fp)
{
    hfile_part *part = fp->parts;
    size_t remaining;

    for (remaining = fp->nparts; remaining > 0; remaining--, part++)
        free_part(part);

    free(fp->parts);
}
71 
// Backend read method: copies up to nbytes bytes into buffer from the current
// part.  Parts are opened lazily, one at a time; when the current part reports
// EOF it is closed, its URL/headers are released, and the next part is tried.
//
// Returns the number of bytes read, 0 once every part has been consumed,
// -1 if a part cannot be opened or closed, or the error value returned by the
// underlying read on fp->currentfp.
static ssize_t multipart_read(hFILE *fpv, void *buffer, size_t nbytes)
{
    hFILE_multipart *fp = (hFILE_multipart *) fpv;
    // Signed on purpose: the underlying read functions return ssize_t, and an
    // error result must stay negative rather than be stored as a huge
    // unsigned count.
    ssize_t n;

    for (;;) {
        hFILE *prevfp;

        if (fp->currentfp == NULL) {
            const hfile_part *p;

            // No more parts, so we're truly at EOF
            if (fp->current >= fp->nparts) return 0;

            p = &fp->parts[fp->current];
            hts_log_debug("Opening part #%zu of %zu: \"%.120s%s\"",
                fp->current+1, fp->nparts, p->url,
                (strlen(p->url) > 120)? "..." : "");

            fp->currentfp = p->headers?
                  hopen(p->url, "r:",
                        "httphdr:v", p->headers,
                        "auth_token_enabled", "false", NULL)
                : hopen(p->url, "r:", "auth_token_enabled", "false", NULL);

            if (fp->currentfp == NULL) return -1;
        }

        n = fp->currentfp->mobile?
              fp->currentfp->backend->read(fp->currentfp, buffer, nbytes)
            : hread(fp->currentfp, buffer, nbytes);

        // Number of bytes read by (or an error from) fp->currentfp
        if (n != 0) return n;

        // We're at EOF on this part, so set up the next part.  The state is
        // advanced before hclose() so that a failed close does not leave fp
        // pointing at a stream that has already been closed.
        prevfp = fp->currentfp;
        free_part(&fp->parts[fp->current]);
        fp->current++;
        fp->currentfp = NULL;
        if (hclose(prevfp) < 0) return -1;
    }
}
112 
// Backend write method.  Multipart streams are read-only, so every call fails
// with errno set to EROFS and a return value of -1.
static ssize_t multipart_write(hFILE *fpv, const void *buffer, size_t nbytes)
{
    (void) fpv;
    (void) buffer;
    (void) nbytes;

    errno = EROFS;
    return -1;
}
118 
// Backend seek method.  Seeking is not supported on a multipart stream, so
// every call fails with errno set to ESPIPE and a return value of -1.
static off_t multipart_seek(hFILE *fpv, off_t offset, int whence)
{
    (void) fpv;
    (void) offset;
    (void) whence;

    errno = ESPIPE;
    return -1;
}
124 
// Backend close method: releases every part description and, if a part is
// currently open, closes its stream.
//
// Returns 0 on success, or -1 if closing the open part fails.
static int multipart_close(hFILE *fpv)
{
    hFILE_multipart *mp = (hFILE_multipart *) fpv;
    hFILE *open_part = mp->currentfp;

    free_all_parts(mp);

    if (open_part == NULL) return 0;
    return (hclose(open_part) < 0)? -1 : 0;
}
136 
137 static const struct hFILE_backend multipart_backend =
138 {
139     multipart_read, multipart_write, multipart_seek, NULL, multipart_close
140 };
141 
142 // Returns 'v' (valid value), 'i' (invalid; required GA4GH field missing),
143 // or upon encountering an unexpected token, that token's type.
144 // Explicit `return '?'` means a JSON parsing error, typically a member key
145 // that is not a string.  An unexpected token may be a valid token that was
146 // not the type expected for a particular GA4GH field, or it may be '?' or
147 // '\0' which should be propagated.
148 static char
parse_ga4gh_body_json(hFILE_multipart * fp,hFILE * json,kstring_t * b,kstring_t * header)149 parse_ga4gh_body_json(hFILE_multipart *fp, hFILE *json,
150                       kstring_t *b, kstring_t *header)
151 {
152     hts_json_token t;
153 
154     if (hts_json_fnext(json, &t, b) != '{') return t.type;
155     while (hts_json_fnext(json, &t, b) != '}') {
156         if (t.type != 's') return '?';
157 
158         if (strcmp(t.str, "urls") == 0) {
159             if (hts_json_fnext(json, &t, b) != '[') return t.type;
160 
161             while (hts_json_fnext(json, &t, b) != ']') {
162                 hfile_part *part;
163                 size_t n = 0, max = 0;
164 
165                 hts_expand(hfile_part, fp->nparts+1, fp->maxparts, fp->parts);
166                 part = &fp->parts[fp->nparts++];
167                 part->url = NULL;
168                 part->headers = NULL;
169 
170                 if (t.type != '{') return t.type;
171                 while (hts_json_fnext(json, &t, b) != '}') {
172                     if (t.type != 's') return '?';
173 
174                     if (strcmp(t.str, "url") == 0) {
175                         if (hts_json_fnext(json, &t, b) != 's') return t.type;
176                         part->url = ks_release(b);
177                     }
178                     else if (strcmp(t.str, "headers") == 0) {
179                         if (hts_json_fnext(json, &t, b) != '{') return t.type;
180 
181                         while (hts_json_fnext(json, &t, header) != '}') {
182                             if (t.type != 's') return '?';
183 
184                             if (hts_json_fnext(json, &t, b) != 's')
185                                 return t.type;
186 
187                             kputs(": ", header);
188                             kputs(t.str, header);
189                             n++;
190                             hts_expand(char *, n+1, max, part->headers);
191                             part->headers[n-1] = ks_release(header);
192                             part->headers[n] = NULL;
193                         }
194                     }
195                     else if (hts_json_fskip_value(json, '\0') != 'v')
196                         return '?';
197                 }
198 
199                 if (! part->url) return 'i';
200             }
201         }
202         else if (strcmp(t.str, "format") == 0) {
203             if (hts_json_fnext(json, &t, b) != 's') return t.type;
204 
205             hts_log_debug("GA4GH JSON redirection to multipart %s data", t.str);
206         }
207         else if (hts_json_fskip_value(json, '\0') != 'v') return '?';
208     }
209 
210     return 'v';
211 }
212 
213 // Returns 'v' (valid value), 'i' (invalid; required GA4GH field missing),
214 // or upon encountering an unexpected token, that token's type.
215 // Explicit `return '?'` means a JSON parsing error, typically a member key
216 // that is not a string.  An unexpected token may be a valid token that was
217 // not the type expected for a particular GA4GH field, or it may be '?' or
218 // '\0' which should be propagated.
219 static char
parse_ga4gh_redirect_json(hFILE_multipart * fp,hFILE * json,kstring_t * b,kstring_t * header)220 parse_ga4gh_redirect_json(hFILE_multipart *fp, hFILE *json,
221                           kstring_t *b, kstring_t *header) {
222     hts_json_token t;
223 
224     if (hts_json_fnext(json, &t, b) != '{') return t.type;
225     while (hts_json_fnext(json, &t, b) != '}') {
226         if (t.type != 's') return '?';
227 
228         if (strcmp(t.str, "htsget") == 0) {
229             char ret = parse_ga4gh_body_json(fp, json, b, header);
230             if (ret != 'v') return ret;
231         }
232         else return '?';
233     }
234 
235     if (hts_json_fnext(json, &t, b) != '\0') return '?';
236 
237     return 'v';
238 }
239 
// Builds a multipart hFILE from a GA4GH htsget redirection document.
//
// hfile is the stream the JSON document is read from; it is not closed here.
// mode is passed through to hfile_init().
//
// Returns the new stream on success.  On failure returns NULL; if the JSON
// could not be accepted, errno is EPROTO for a parse error or unexpected end
// of input ('?' or '\0' from the parser) and EINVAL otherwise.
hFILE *hopen_htsget_redirect(hFILE *hfile, const char *mode)
{
    kstring_t body = { 0, 0, NULL }, hdr = { 0, 0, NULL };
    hFILE_multipart *mp;
    char status;

    mp = (hFILE_multipart *) hfile_init(sizeof (hFILE_multipart), mode, 0);
    if (mp == NULL) return NULL;

    // Start with an empty part list for the parser to append to
    mp->parts = NULL;
    mp->nparts = 0;
    mp->maxparts = 0;

    status = parse_ga4gh_redirect_json(mp, hfile, &body, &hdr);

    // The scratch buffers are only needed while parsing
    free(body.s);
    free(hdr.s);

    if (status == 'v') {
        mp->current = 0;
        mp->currentfp = NULL;
        mp->base.backend = &multipart_backend;
        return &mp->base;
    }

    // Parsing failed: discard any parts gathered so far, then the handle
    free_all_parts(mp);
    hfile_destroy((hFILE *) mp);
    if (status == '?' || status == '\0') errno = EPROTO;
    else errno = EINVAL;
    return NULL;
}
267