1 /* api.c - Copyright (c) 2018, Sijmen J. Mulder (see LICENSE.md) */
2 
/* value of the User-Agent header sent with every API request */
#define USERAGENT	"nostt (+https://github.com/sjmulder/nostt)"
/* URL prefix of the API; the requested page id is appended to it */
#define ENDPOINT	"http://teletekst-data.nos.nl/json/"
/* written in place of page characters at or above 0xF000 */
#define SUBST_CHAR	'%'
6 
7 #include <stdio.h>
8 #include <string.h>
9 #include <ctype.h>
10 #include <wchar.h>
11 #include <curl/curl.h>
12 #include <json-c/json.h>
13 #include "api.h"
14 
/* number of elements in a true array (not valid for pointers) */
#define LEN(a) (sizeof(a) / sizeof((a)[0]))
16 
/* states of the HTML scanner in parse() */
enum parsest {
	PS_IN_TEXT = 0,		/* outside any tag; input is page content */
	PS_IN_TAG,		/* between '<' and '>' */
	PS_IN_ATTRQUOTES	/* inside a double-quoted value within a tag */
};
22 
23 /* see jsonwrite() below */
24 struct jsonctx {
25 	enum tterr		 err;
26 	struct json_tokener	*tokener;
27 	struct json_object	*object;
28 };
29 
/* one row of the named-entity table used by unescape() */
struct entity {
	char	seq[10];	/* entity as written in the input, "&" through ";" */
	wchar_t	wc;		/* character the entity decodes to */
};
34 
35 static char curlerrbuf[CURL_ERROR_SIZE];
36 
/* Class names as they appear in the HTML. parsecolor() casts the array index
   of a match to enum ttcolor, so the order must follow that enum. */
static const char *colornames[] = {
	"black",	/* 0 */
	"red",		/* 1 */
	"green",	/* 2 */
	"yellow",	/* 3 */
	"blue",		/* 4 */
	"magenta",	/* 5 */
	"cyan",		/* 6 */
	"white"		/* 7 */
};
48 
49 static const struct entity entities[] = {
50 	/* HTMLspecial */
51 	{ "&qout;",	0x22 }, { "&amp;",	0x26 },
52 	{ "&apos;",	0x27 }, { "&lt;",	0x3C },
53 	{ "&gt;",	0x3E },
54 	/* HTMLlat1 */
55 	{ "&AElig;",	0xC6 }, { "&Aacute;",	0xC1 },
56 	{ "&Acirc;",	0xC2 }, { "&Agrave;",	0xC0 },
57 	{ "&Aring;",	0xC5 }, { "&Atilde;",	0xC3 },
58 	{ "&Auml;",	0xC4 }, { "&Ccedil;",	0xC7 },
59 	{ "&ETH;",	0xD0 }, { "&Eacute;",	0xC9 },
60 	{ "&Ecirc;",	0xCA }, { "&Egrave;",	0xC8 },
61 	{ "&Euml;",	0xCB }, { "&Iacute;",	0xCD },
62 	{ "&Icirc;",	0xCE }, { "&Igrave;",	0xCC },
63 	{ "&Iuml;",	0xCF }, { "&Ntilde;",	0xD1 },
64 	{ "&Oacute;",	0xD3 }, { "&Ocirc;",	0xD4 },
65 	{ "&Ograve;",	0xD2 }, { "&Oslash;",	0xD8 },
66 	{ "&Otilde;",	0xD5 }, { "&Ouml;",	0xD6 },
67 	{ "&THORN;",	0xDE }, { "&Uacute;",	0xDA },
68 	{ "&Ucirc;",	0xDB }, { "&Ugrave;",	0xD9 },
69 	{ "&Uuml;",	0xDC }, { "&Yacute;",	0xDD },
70 	{ "&aacute;",	0xE1 }, { "&acirc;",	0xE2 },
71 	{ "&aelig;",	0xE6 }, { "&agrave;",	0xE0 },
72 	{ "&aring;",	0xE5 }, { "&atilde;",	0xE3 },
73 	{ "&auml;",	0xE4 }, { "&ccedil;",	0xE7 },
74 	{ "&eacute;",	0xE9 }, { "&ecirc;",	0xEA },
75 	{ "&egrave;",	0xE8 }, { "&eth;",	0xF0 },
76 	{ "&euml;",	0xEB }, { "&iacute;",	0xED },
77 	{ "&icirc;",	0xEE }, { "&igrave;",	0xEC },
78 	{ "&iuml;",	0xEF }, { "&ntilde;",	0xF1 },
79 	{ "&oacute;",	0xF3 }, { "&ocirc;",	0xF4 },
80 	{ "&ograve;",	0xF2 }, { "&oslash;",	0xF8 },
81 	{ "&otilde;",	0xF5 }, { "&ouml;",	0xF6 },
82 	{ "&szlig;",	0xDF }, { "&thorn;",	0xFE },
83 	{ "&uacute;",	0xFA }, { "&ucirc;",	0xFB },
84 	{ "&ugrave;",	0xF9 }, { "&uuml;",	0xFC },
85 	{ "&yacute;",	0xFD }, { "&yuml;",	0xFF }
86 };
87 
88 static const struct ttattrs defattrs = {
89 	/* fg */	TT_WHITE,
90 	/* bg */	TT_BLACK
91 };
92 
93 /* Callback for curl; directly forwards data to the JSON parser. */
94 static size_t
jsonwrite(char * ptr,size_t sz,size_t nmemb,struct jsonctx * ctx)95 jsonwrite(char *ptr, size_t sz, size_t nmemb, struct jsonctx *ctx)
96 {
97 	enum json_tokener_error jsonerr;
98 
99 	if (ctx->err != TT_OK)
100 		return 0;
101 
102 	if (ctx->object) {
103 		ctx->err = TT_EDATA;
104 		return 0;
105 	}
106 
107 	ctx->object = json_tokener_parse_ex(ctx->tokener, ptr,
108 	    (int)(sz * nmemb));
109 	if (!ctx->object) {
110 		jsonerr = json_tokener_get_error(ctx->tokener);
111 		if (jsonerr != json_tokener_continue)
112 			ctx->err = TT_EDATA;
113 	}
114 
115 	return sz * nmemb;
116 }
117 
118 /* does nothing if no match */
119 static void
parsecolor(const char * str,const char * end,enum ttcolor * color)120 parsecolor(const char *str, const char *end, enum ttcolor *color)
121 {
122 	size_t	i, len;
123 
124 	for (i = 0; i < LEN(colornames); i++) {
125 		len = strlen(colornames[i]);
126 		if ((ptrdiff_t)len >= end - str &&
127 		    !strncmp(str, colornames[i], len)) {
128 			*color = (enum ttcolor)i;
129 			return;
130 		}
131 	}
132 }
133 
134 /* Unescapes HTML entities. *endp will point to the first character past the
135    escape sequence. */
136 static wchar_t
unescape(const char * sequence,const char ** endp)137 unescape(const char *sequence, const char **endp)
138 {
139 	wchar_t	wc;
140 	size_t	len, i;
141 
142 	if (*sequence != '&') {
143 		/* assign wc first in case sequence=*endp */
144 		wc = *sequence;
145 		*endp = sequence + 1;
146 		return wc;
147 	}
148 
149 	if (!strncmp("&#x", sequence, 3)) {
150 		wc = (wchar_t)strtol(sequence+3, (char **)endp, 16);
151 		if (**endp == ';')
152 			(*endp)++;
153 		return wc;
154 	}
155 
156 	for (i = 0; i < LEN(entities); i++) {
157 		len = strlen(entities[i].seq);
158 		if (!strncmp(entities[i].seq, sequence, len)) {
159 			*endp = sequence + len;
160 			return entities[i].wc;
161 		}
162 	}
163 
164 	*endp = sequence + 1;
165 	return '&';
166 }
167 
168 /* parses class name lists like "red bg-white" into attrs->fg and attrs->bg,
169    or leaves them untouched if not specified */
170 static void
parsecolors(const char * str,const char * end,struct ttattrs * attrs)171 parsecolors(const char *str, const char *end, struct ttattrs *attrs)
172 {
173 	enum ttcolor	*color;
174 	const char	*wordend;
175 
176 	while (end - str > 3) {
177 		if (memcmp("bg-", str, 3) == 0) {
178 			color = &attrs->bg;
179 			str += 3;
180 		} else
181 			color = &attrs->fg;
182 
183 		wordend = str+1;
184 		while (wordend < end && !isspace(*wordend))
185 			wordend++;
186 
187 		parsecolor(str, wordend, color);
188 
189 		str = wordend;
190 		while (str < end && isspace(*str))
191 			str++;
192 	}
193 }
194 
195 /* Very simple HTML parser. Only accepts the following sort of input:
196 
197      <span class="red bg-white">NOS</span> TELETEKST
198      Nieuws  <span class="cyan"><a href="#101">101</a></span>
199      Sport   <span class="cyan"><a href="#102">102</a></span>
200 
201    Every cell in the page is assigned, either with content, or with a space
202    character.
203 
204    HTML element and attribute names themselves are ignored; if a tag contains
205    quotes it is assumed to be a class list. Any tag with a '/' in it is
206    considered a closing tag. Nesting is supported, but no self-closing tags
207    and such.
208 
209    This may all seem horribly limited but it's only meant to parse the HTML
210    output from the API, which it does. */
211 static enum tterr
parse(const char * html,struct ttpage * page)212 parse(const char *html, struct ttpage *page)
213 {
214 	const char	*p;
215 	wchar_t		 wc;
216 	int		 line		= 0;
217 	int		 col		= 0;
218 	const char	*openquote	= NULL;
219 	const char	*closequote	= NULL;
220 	enum parsest	 state		= PS_IN_TEXT;
221 	struct ttattrs	 curattrs	= defattrs;
222 	struct ttattrs	 attrstack[8];
223 	int		 attrdepth	= 0;
224 
225 	attrstack[0] = defattrs;
226 
227 	p = html;
228 	while (line < TT_NLINES) {
229 		/* clear rest of line if EOL or EOF */
230 		if (*p == '\0' || *p == '\n') {
231 			if (*p == '\n')
232 				p++;
233 			while (col < TT_NCOLS) {
234 				page->attrs[line][col] = defattrs;
235 				page->chars[line][col] = L' ';
236 				col++;
237 			}
238 			line++;
239 			col = 0;
240 			continue;
241 		}
242 
243 		if (*p == '&') {
244 			wc = unescape(p, &p);
245 			p--; /* offset the p++ later on */
246 		} else
247 			wc = *p;
248 
249 		switch (state) {
250 		case PS_IN_TEXT:
251 			switch (wc) {
252 			case '<':
253 				state = PS_IN_TAG;
254 				if (++attrdepth < (int)LEN(attrstack))
255 					attrstack[attrdepth] = curattrs;
256 				break;
257 			default:
258 				/* ignore input beyond line length */
259 				if (col < TT_NCOLS) {
260 					page->chars[line][col] = wc;
261 					page->attrs[line][col] = curattrs;
262 					col++;
263 				}
264 				break;
265 			}
266 			break;
267 
268 		case PS_IN_TAG:
269 			switch (wc) {
270 			case '/':
271 				/* End tag, pop attrs. Twice, because we just
272 				   pushed on the start of this closing tag
273 				   too. */
274 				if ((attrdepth -= 2) < 0)
275 					attrdepth = 0;
276 				else if (attrdepth < (int)LEN(attrstack))
277 					curattrs = attrstack[attrdepth+1];
278 				break;
279 			case '"':
280 				state = PS_IN_ATTRQUOTES;
281 				openquote = p;
282 				break;
283 			case '>':
284 				state = PS_IN_TEXT;
285 				break;
286 			}
287 			break;
288 
289 		case PS_IN_ATTRQUOTES:
290 			switch (wc) {
291 			case '"':
292 				state = PS_IN_TAG;
293 				closequote = p;
294 				break;
295 			case '>':
296 				state = PS_IN_TEXT;
297 				closequote = p;
298 				break;
299 			}
300 			if (state != PS_IN_ATTRQUOTES) {
301 				/* we assume the attribute is 'class', so
302 				   parse the colors */
303 				parsecolors(openquote+1, closequote,
304 				    &curattrs);
305 			}
306 			break;
307 		}
308 
309 		p++;
310 	}
311 
312 	return TT_OK;
313 }
314 
315 enum tterr
tt_get(const char * id,struct ttpage * page)316 tt_get(const char *id, struct ttpage *page)
317 {
318 	enum tterr		 err	= TT_OK;
319 	char			 url[128];
320 	CURL			*curl	= NULL;
321 	long			 status;
322 	struct jsonctx		 json;
323 	struct json_object	*jval;
324 	const char		*html;
325 	wchar_t			*wcp;
326 	int			 line, col;
327 	struct curl_slist	*list	= NULL;
328 
329 	if (!id || !*id || !page)
330 		return TT_EARG;
331 
332 	memset(&json, 0, sizeof(json));
333 
334 	snprintf(url, LEN(url), ENDPOINT "%s", id);
335 	url[LEN(url)-1] = '\0';
336 
337 	json.tokener = json_tokener_new();
338 
339 	if (!(curl = curl_easy_init())) {
340 		err = TT_ECURL;
341 		goto cleanup;
342 	}
343 
344 	curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curlerrbuf);
345 	curl_easy_setopt(curl, CURLOPT_USERAGENT, USERAGENT);
346 	curl_easy_setopt(curl, CURLOPT_URL, url);
347 	curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, jsonwrite);
348 	curl_easy_setopt(curl, CURLOPT_WRITEDATA, &json);
349 
350 	list = curl_slist_append(list, "Accept-Encoding: application/json");
351 	curl_easy_setopt(curl, CURLOPT_HTTPHEADER, list);
352 
353 	if (curl_easy_perform(curl) != CURLE_OK) {
354 		err = TT_ECURL;
355 		goto cleanup;
356 	}
357 
358 	curl_slist_free_all(list);
359 
360 	curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &status);
361 	if (status < 200 || status >= 300) {
362 		err = TT_EAPI;
363 		goto cleanup;
364 	}
365 
366 	if (!json.object ||
367 	    !json_object_is_type(json.object, json_type_object)) {
368 		err = TT_EDATA;
369 		goto cleanup;
370 	}
371 
372 	strncpy(page->id, id, LEN(page->id)-1);
373 	page->id[LEN(page->id)-1] = '\0';
374 	page->nextpage[0] = '\0';
375 	page->nextsub[0] = '\0';
376 
377 	if (json_object_object_get_ex(json.object, "nextPage", &jval)) {
378 		strncpy(page->nextpage, json_object_get_string(jval),
379 		    LEN(page->nextpage)-1);
380 		page->nextpage[LEN(page->nextpage)-1] = '\0';
381 	}
382 
383 	if (json_object_object_get_ex(json.object, "nextSubPage", &jval)) {
384 		strncpy(page->nextsub, json_object_get_string(jval),
385 		    LEN(page->nextsub)-1);
386 		page->nextsub[LEN(page->nextsub)-1] = '\0';
387 	}
388 
389 	if (!json_object_object_get_ex(json.object, "content", &jval) ||
390 	    !(html = json_object_get_string(jval))) {
391 		err = TT_EDATA;
392 		goto cleanup;
393 	}
394 
395 	parse(html, page);
396 
397 	/* Map block drawing characters */
398 	for (line = 0; line < TT_NLINES; line++) {
399 		for (col = 0; col < TT_NCOLS; col++) {
400 			wcp = &page->chars[line][col];
401 			if (*wcp >= 0xF000)
402 				*wcp = SUBST_CHAR;
403 		}
404 	}
405 
406 cleanup:
407 	if (curl)
408 		curl_easy_cleanup(curl);
409 	if (json.tokener)
410 		json_tokener_free(json.tokener);
411 
412 	return err;
413 }
414 
415 const char *
tt_errstr(enum tterr err)416 tt_errstr(enum tterr err)
417 {
418 	switch (err) {
419 	case TT_OK:	return "no error";
420 	case TT_EARG:	return "invalid argument";
421 	case TT_ECURL:	return curlerrbuf;
422 	case TT_EAPI:	return "API returned an error code";
423 	case TT_EDATA:	return "API returned invalid or unexpected data";
424 	default:	return "unknown error";
425 	}
426 }
427