1 /* api.c - Copyright (c) 2018, Sijmen J. Mulder (see LICENSE.md) */
2
/* Value of the User-Agent header sent with every API request. */
#define USERAGENT	"nostt (+https://github.com/sjmulder/nostt)"
/* Base URL of the API; the page ID is appended to it. */
#define ENDPOINT	"http://teletekst-data.nos.nl/json/"
/* Stand-in for characters at or above 0xF000, see tt_get(). */
#define SUBST_CHAR	'%'
6
#include <ctype.h>
#include <limits.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>

#include <curl/curl.h>
#include <json-c/json.h>

#include "api.h"
14
/* Number of elements in an array; must not be used on a pointer. */
#define LEN(a)	(sizeof(a) / sizeof((a)[0]))
16
/* States of the HTML scanner in parse(). */
enum parsest {
	PS_IN_TEXT,		/* outside any tag */
	PS_IN_TAG,		/* between '<' and '>' */
	PS_IN_ATTRQUOTES	/* inside a quoted attribute value */
};
22
23 /* see jsonwrite() below */
24 struct jsonctx {
25 enum tterr err;
26 struct json_tokener *tokener;
27 struct json_object *object;
28 };
29
/* A named character reference and the character it decodes to. */
struct entity {
	char	seq[10];	/* text as it appears in the input, starting with '&' */
	wchar_t	wc;		/* decoded character */
};
34
35 static char curlerrbuf[CURL_ERROR_SIZE];
36
/*
 * Color class names, indexed by ttcolor: parsecolor() converts the array
 * index of a matching name directly to an enum ttcolor.
 */
static const char *colornames[] = {
	"black", "red", "green", "yellow",
	"blue", "magenta", "cyan", "white"
};
48
49 static const struct entity entities[] = {
50 /* HTMLspecial */
51 { "&qout;", 0x22 }, { "&", 0x26 },
52 { "'", 0x27 }, { "<", 0x3C },
53 { ">", 0x3E },
54 /* HTMLlat1 */
55 { "Æ", 0xC6 }, { "Á", 0xC1 },
56 { "Â", 0xC2 }, { "À", 0xC0 },
57 { "Å", 0xC5 }, { "Ã", 0xC3 },
58 { "Ä", 0xC4 }, { "Ç", 0xC7 },
59 { "Ð", 0xD0 }, { "É", 0xC9 },
60 { "Ê", 0xCA }, { "È", 0xC8 },
61 { "Ë", 0xCB }, { "Í", 0xCD },
62 { "Î", 0xCE }, { "Ì", 0xCC },
63 { "Ï", 0xCF }, { "Ñ", 0xD1 },
64 { "Ó", 0xD3 }, { "Ô", 0xD4 },
65 { "Ò", 0xD2 }, { "Ø", 0xD8 },
66 { "Õ", 0xD5 }, { "Ö", 0xD6 },
67 { "Þ", 0xDE }, { "Ú", 0xDA },
68 { "Û", 0xDB }, { "Ù", 0xD9 },
69 { "Ü", 0xDC }, { "Ý", 0xDD },
70 { "á", 0xE1 }, { "â", 0xE2 },
71 { "æ", 0xE6 }, { "à", 0xE0 },
72 { "å", 0xE5 }, { "ã", 0xE3 },
73 { "ä", 0xE4 }, { "ç", 0xE7 },
74 { "é", 0xE9 }, { "ê", 0xEA },
75 { "è", 0xE8 }, { "ð", 0xF0 },
76 { "ë", 0xEB }, { "í", 0xED },
77 { "î", 0xEE }, { "ì", 0xEC },
78 { "ï", 0xEF }, { "ñ", 0xF1 },
79 { "ó", 0xF3 }, { "ô", 0xF4 },
80 { "ò", 0xF2 }, { "ø", 0xF8 },
81 { "õ", 0xF5 }, { "ö", 0xF6 },
82 { "ß", 0xDF }, { "þ", 0xFE },
83 { "ú", 0xFA }, { "û", 0xFB },
84 { "ù", 0xF9 }, { "ü", 0xFC },
85 { "ý", 0xFD }, { "ÿ", 0xFF }
86 };
87
88 static const struct ttattrs defattrs = {
89 /* fg */ TT_WHITE,
90 /* bg */ TT_BLACK
91 };
92
93 /* Callback for curl; directly forwards data to the JSON parser. */
94 static size_t
jsonwrite(char * ptr,size_t sz,size_t nmemb,struct jsonctx * ctx)95 jsonwrite(char *ptr, size_t sz, size_t nmemb, struct jsonctx *ctx)
96 {
97 enum json_tokener_error jsonerr;
98
99 if (ctx->err != TT_OK)
100 return 0;
101
102 if (ctx->object) {
103 ctx->err = TT_EDATA;
104 return 0;
105 }
106
107 ctx->object = json_tokener_parse_ex(ctx->tokener, ptr,
108 (int)(sz * nmemb));
109 if (!ctx->object) {
110 jsonerr = json_tokener_get_error(ctx->tokener);
111 if (jsonerr != json_tokener_continue)
112 ctx->err = TT_EDATA;
113 }
114
115 return sz * nmemb;
116 }
117
118 /* does nothing if no match */
119 static void
parsecolor(const char * str,const char * end,enum ttcolor * color)120 parsecolor(const char *str, const char *end, enum ttcolor *color)
121 {
122 size_t i, len;
123
124 for (i = 0; i < LEN(colornames); i++) {
125 len = strlen(colornames[i]);
126 if ((ptrdiff_t)len >= end - str &&
127 !strncmp(str, colornames[i], len)) {
128 *color = (enum ttcolor)i;
129 return;
130 }
131 }
132 }
133
134 /* Unescapes HTML entities. *endp will point to the first character past the
135 escape sequence. */
136 static wchar_t
unescape(const char * sequence,const char ** endp)137 unescape(const char *sequence, const char **endp)
138 {
139 wchar_t wc;
140 size_t len, i;
141
142 if (*sequence != '&') {
143 /* assign wc first in case sequence=*endp */
144 wc = *sequence;
145 *endp = sequence + 1;
146 return wc;
147 }
148
149 if (!strncmp("&#x", sequence, 3)) {
150 wc = (wchar_t)strtol(sequence+3, (char **)endp, 16);
151 if (**endp == ';')
152 (*endp)++;
153 return wc;
154 }
155
156 for (i = 0; i < LEN(entities); i++) {
157 len = strlen(entities[i].seq);
158 if (!strncmp(entities[i].seq, sequence, len)) {
159 *endp = sequence + len;
160 return entities[i].wc;
161 }
162 }
163
164 *endp = sequence + 1;
165 return '&';
166 }
167
168 /* parses class name lists like "red bg-white" into attrs->fg and attrs->bg,
169 or leaves them untouched if not specified */
170 static void
parsecolors(const char * str,const char * end,struct ttattrs * attrs)171 parsecolors(const char *str, const char *end, struct ttattrs *attrs)
172 {
173 enum ttcolor *color;
174 const char *wordend;
175
176 while (end - str > 3) {
177 if (memcmp("bg-", str, 3) == 0) {
178 color = &attrs->bg;
179 str += 3;
180 } else
181 color = &attrs->fg;
182
183 wordend = str+1;
184 while (wordend < end && !isspace(*wordend))
185 wordend++;
186
187 parsecolor(str, wordend, color);
188
189 str = wordend;
190 while (str < end && isspace(*str))
191 str++;
192 }
193 }
194
195 /* Very simple HTML parser. Only accepts the following sort of input:
196
197 <span class="red bg-white">NOS</span> TELETEKST
198 Nieuws <span class="cyan"><a href="#101">101</a></span>
199 Sport <span class="cyan"><a href="#102">102</a></span>
200
201 Every cell in the page is assigned, either with content, or with a space
202 character.
203
204 HTML element and attribute names themselves are ignored; if a tag contains
205 quotes it is assumed to be a class list. Any tag with a '/' in it is
206 considered a closing tag. Nesting is supported, but no self-closing tags
207 and such.
208
209 This may all seem horribly limited but it's only meant to parse the HTML
210 output from the API, which it does. */
211 static enum tterr
parse(const char * html,struct ttpage * page)212 parse(const char *html, struct ttpage *page)
213 {
214 const char *p;
215 wchar_t wc;
216 int line = 0;
217 int col = 0;
218 const char *openquote = NULL;
219 const char *closequote = NULL;
220 enum parsest state = PS_IN_TEXT;
221 struct ttattrs curattrs = defattrs;
222 struct ttattrs attrstack[8];
223 int attrdepth = 0;
224
225 attrstack[0] = defattrs;
226
227 p = html;
228 while (line < TT_NLINES) {
229 /* clear rest of line if EOL or EOF */
230 if (*p == '\0' || *p == '\n') {
231 if (*p == '\n')
232 p++;
233 while (col < TT_NCOLS) {
234 page->attrs[line][col] = defattrs;
235 page->chars[line][col] = L' ';
236 col++;
237 }
238 line++;
239 col = 0;
240 continue;
241 }
242
243 if (*p == '&') {
244 wc = unescape(p, &p);
245 p--; /* offset the p++ later on */
246 } else
247 wc = *p;
248
249 switch (state) {
250 case PS_IN_TEXT:
251 switch (wc) {
252 case '<':
253 state = PS_IN_TAG;
254 if (++attrdepth < (int)LEN(attrstack))
255 attrstack[attrdepth] = curattrs;
256 break;
257 default:
258 /* ignore input beyond line length */
259 if (col < TT_NCOLS) {
260 page->chars[line][col] = wc;
261 page->attrs[line][col] = curattrs;
262 col++;
263 }
264 break;
265 }
266 break;
267
268 case PS_IN_TAG:
269 switch (wc) {
270 case '/':
271 /* End tag, pop attrs. Twice, because we just
272 pushed on the start of this closing tag
273 too. */
274 if ((attrdepth -= 2) < 0)
275 attrdepth = 0;
276 else if (attrdepth < (int)LEN(attrstack))
277 curattrs = attrstack[attrdepth+1];
278 break;
279 case '"':
280 state = PS_IN_ATTRQUOTES;
281 openquote = p;
282 break;
283 case '>':
284 state = PS_IN_TEXT;
285 break;
286 }
287 break;
288
289 case PS_IN_ATTRQUOTES:
290 switch (wc) {
291 case '"':
292 state = PS_IN_TAG;
293 closequote = p;
294 break;
295 case '>':
296 state = PS_IN_TEXT;
297 closequote = p;
298 break;
299 }
300 if (state != PS_IN_ATTRQUOTES) {
301 /* we assume the attribute is 'class', so
302 parse the colors */
303 parsecolors(openquote+1, closequote,
304 &curattrs);
305 }
306 break;
307 }
308
309 p++;
310 }
311
312 return TT_OK;
313 }
314
315 enum tterr
tt_get(const char * id,struct ttpage * page)316 tt_get(const char *id, struct ttpage *page)
317 {
318 enum tterr err = TT_OK;
319 char url[128];
320 CURL *curl = NULL;
321 long status;
322 struct jsonctx json;
323 struct json_object *jval;
324 const char *html;
325 wchar_t *wcp;
326 int line, col;
327 struct curl_slist *list = NULL;
328
329 if (!id || !*id || !page)
330 return TT_EARG;
331
332 memset(&json, 0, sizeof(json));
333
334 snprintf(url, LEN(url), ENDPOINT "%s", id);
335 url[LEN(url)-1] = '\0';
336
337 json.tokener = json_tokener_new();
338
339 if (!(curl = curl_easy_init())) {
340 err = TT_ECURL;
341 goto cleanup;
342 }
343
344 curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curlerrbuf);
345 curl_easy_setopt(curl, CURLOPT_USERAGENT, USERAGENT);
346 curl_easy_setopt(curl, CURLOPT_URL, url);
347 curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, jsonwrite);
348 curl_easy_setopt(curl, CURLOPT_WRITEDATA, &json);
349
350 list = curl_slist_append(list, "Accept-Encoding: application/json");
351 curl_easy_setopt(curl, CURLOPT_HTTPHEADER, list);
352
353 if (curl_easy_perform(curl) != CURLE_OK) {
354 err = TT_ECURL;
355 goto cleanup;
356 }
357
358 curl_slist_free_all(list);
359
360 curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &status);
361 if (status < 200 || status >= 300) {
362 err = TT_EAPI;
363 goto cleanup;
364 }
365
366 if (!json.object ||
367 !json_object_is_type(json.object, json_type_object)) {
368 err = TT_EDATA;
369 goto cleanup;
370 }
371
372 strncpy(page->id, id, LEN(page->id)-1);
373 page->id[LEN(page->id)-1] = '\0';
374 page->nextpage[0] = '\0';
375 page->nextsub[0] = '\0';
376
377 if (json_object_object_get_ex(json.object, "nextPage", &jval)) {
378 strncpy(page->nextpage, json_object_get_string(jval),
379 LEN(page->nextpage)-1);
380 page->nextpage[LEN(page->nextpage)-1] = '\0';
381 }
382
383 if (json_object_object_get_ex(json.object, "nextSubPage", &jval)) {
384 strncpy(page->nextsub, json_object_get_string(jval),
385 LEN(page->nextsub)-1);
386 page->nextsub[LEN(page->nextsub)-1] = '\0';
387 }
388
389 if (!json_object_object_get_ex(json.object, "content", &jval) ||
390 !(html = json_object_get_string(jval))) {
391 err = TT_EDATA;
392 goto cleanup;
393 }
394
395 parse(html, page);
396
397 /* Map block drawing characters */
398 for (line = 0; line < TT_NLINES; line++) {
399 for (col = 0; col < TT_NCOLS; col++) {
400 wcp = &page->chars[line][col];
401 if (*wcp >= 0xF000)
402 *wcp = SUBST_CHAR;
403 }
404 }
405
406 cleanup:
407 if (curl)
408 curl_easy_cleanup(curl);
409 if (json.tokener)
410 json_tokener_free(json.tokener);
411
412 return err;
413 }
414
415 const char *
tt_errstr(enum tterr err)416 tt_errstr(enum tterr err)
417 {
418 switch (err) {
419 case TT_OK: return "no error";
420 case TT_EARG: return "invalid argument";
421 case TT_ECURL: return curlerrbuf;
422 case TT_EAPI: return "API returned an error code";
423 case TT_EDATA: return "API returned invalid or unexpected data";
424 default: return "unknown error";
425 }
426 }
427