1 /*
2 * native ebml reader for the Matroska demuxer
3 * new parser copyright (c) 2010 Uoti Urpala
4 * copyright (c) 2004 Aurelien Jacobs <aurel@gnuage.org>
5 * based on the one written by Ronald Bultje for gstreamer
6 *
7 * This file is part of mpv.
8 *
9 * mpv is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * mpv is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with mpv. If not, see <http://www.gnu.org/licenses/>.
21 */
22
23 #include "config.h"
24
25 #include <stdlib.h>
26 #include <stdbool.h>
27 #include <inttypes.h>
28 #include <stddef.h>
29 #include <assert.h>
30
31 #include <libavutil/intfloat.h>
32 #include <libavutil/common.h>
33 #include "mpv_talloc.h"
34 #include "ebml.h"
35 #include "stream/stream.h"
36 #include "common/msg.h"
37
38 // Whether the id is a known Matroska level 1 element (allowed as element on
39 // global file level, after the level 0 MATROSKA_ID_SEGMENT).
40 // This (intentionally) doesn't include "global" elements.
ebml_is_mkv_level1_id(uint32_t id)41 bool ebml_is_mkv_level1_id(uint32_t id)
42 {
43 switch (id) {
44 case MATROSKA_ID_SEEKHEAD:
45 case MATROSKA_ID_INFO:
46 case MATROSKA_ID_CLUSTER:
47 case MATROSKA_ID_TRACKS:
48 case MATROSKA_ID_CUES:
49 case MATROSKA_ID_ATTACHMENTS:
50 case MATROSKA_ID_CHAPTERS:
51 case MATROSKA_ID_TAGS:
52 return true;
53 default:
54 return false;
55 }
56 }
57
58 /*
59 * Read: the element content data ID.
60 * Return: the ID.
61 */
ebml_read_id(stream_t * s)62 uint32_t ebml_read_id(stream_t *s)
63 {
64 int i, len_mask = 0x80;
65 uint32_t id;
66
67 for (i = 0, id = stream_read_char(s); i < 4 && !(id & len_mask); i++)
68 len_mask >>= 1;
69 if (i >= 4)
70 return EBML_ID_INVALID;
71 while (i--)
72 id = (id << 8) | stream_read_char(s);
73 return id;
74 }
75
76 /*
77 * Read: element content length.
78 */
ebml_read_length(stream_t * s)79 uint64_t ebml_read_length(stream_t *s)
80 {
81 int i, j, num_ffs = 0, len_mask = 0x80;
82 uint64_t len;
83
84 for (i = 0, len = stream_read_char(s); i < 8 && !(len & len_mask); i++)
85 len_mask >>= 1;
86 if (i >= 8)
87 return EBML_UINT_INVALID;
88 j = i + 1;
89 if ((int) (len &= (len_mask - 1)) == len_mask - 1)
90 num_ffs++;
91 while (i--) {
92 len = (len << 8) | stream_read_char(s);
93 if ((len & 0xFF) == 0xFF)
94 num_ffs++;
95 }
96 if (j == num_ffs)
97 return EBML_UINT_INVALID;
98 if (len >= 1ULL<<63) // Can happen if stream_read_char returns EOF
99 return EBML_UINT_INVALID;
100 return len;
101 }
102
103
104 /*
105 * Read a variable length signed int.
106 */
ebml_read_signed_length(stream_t * s)107 int64_t ebml_read_signed_length(stream_t *s)
108 {
109 uint64_t unum;
110 int l;
111
112 /* read as unsigned number first */
113 uint64_t offset = stream_tell(s);
114 unum = ebml_read_length(s);
115 if (unum == EBML_UINT_INVALID)
116 return EBML_INT_INVALID;
117 l = stream_tell(s) - offset;
118
119 return unum - ((1LL << ((7 * l) - 1)) - 1);
120 }
121
122 /*
123 * Read the next element as an unsigned int.
124 */
ebml_read_uint(stream_t * s)125 uint64_t ebml_read_uint(stream_t *s)
126 {
127 uint64_t len, value = 0;
128
129 len = ebml_read_length(s);
130 if (len == EBML_UINT_INVALID || len > 8)
131 return EBML_UINT_INVALID;
132
133 while (len--)
134 value = (value << 8) | stream_read_char(s);
135
136 return value;
137 }
138
139 /*
140 * Read the next element as a signed int.
141 */
ebml_read_int(stream_t * s)142 int64_t ebml_read_int(stream_t *s)
143 {
144 uint64_t value = 0;
145 uint64_t len;
146 int l;
147
148 len = ebml_read_length(s);
149 if (len == EBML_UINT_INVALID || len > 8)
150 return EBML_INT_INVALID;
151 if (!len)
152 return 0;
153
154 len--;
155 l = stream_read_char(s);
156 if (l & 0x80)
157 value = -1;
158 value = (value << 8) | l;
159 while (len--)
160 value = (value << 8) | stream_read_char(s);
161
162 return (int64_t)value; // assume complement of 2
163 }
164
165 /*
166 * Skip the current element.
167 * end: the end of the parent element or -1 (for robust error handling)
168 */
ebml_read_skip(struct mp_log * log,int64_t end,stream_t * s)169 int ebml_read_skip(struct mp_log *log, int64_t end, stream_t *s)
170 {
171 uint64_t len;
172
173 int64_t pos = stream_tell(s);
174
175 len = ebml_read_length(s);
176 if (len == EBML_UINT_INVALID)
177 goto invalid;
178
179 int64_t pos2 = stream_tell(s);
180 if (len >= INT64_MAX - pos2 || (end > 0 && pos2 + len > end))
181 goto invalid;
182
183 if (!stream_seek_skip(s, pos2 + len))
184 goto invalid;
185
186 return 0;
187
188 invalid:
189 mp_err(log, "Invalid EBML length at position %"PRId64"\n", pos);
190 stream_seek_skip(s, pos);
191 return 1;
192 }
193
194 /*
195 * Skip to (probable) next cluster (MATROSKA_ID_CLUSTER) element start position.
196 */
ebml_resync_cluster(struct mp_log * log,stream_t * s)197 int ebml_resync_cluster(struct mp_log *log, stream_t *s)
198 {
199 int64_t pos = stream_tell(s);
200 uint32_t last_4_bytes = 0;
201 stream_read_peek(s, &(char){0}, 1);
202 if (!s->eof) {
203 mp_err(log, "Corrupt file detected. "
204 "Trying to resync starting from position %"PRId64"...\n", pos);
205 }
206 while (!s->eof) {
207 // Assumes MATROSKA_ID_CLUSTER is 4 bytes, with no 0 bytes.
208 if (last_4_bytes == MATROSKA_ID_CLUSTER) {
209 mp_err(log, "Cluster found at %"PRId64".\n", pos - 4);
210 stream_seek(s, pos - 4);
211 return 0;
212 }
213 last_4_bytes = (last_4_bytes << 8) | stream_read_char(s);
214 pos++;
215 }
216 return -1;
217 }
218
219
220
221 #define EVALARGS(F, ...) F(__VA_ARGS__)
222 #define E(str, N, type) const struct ebml_elem_desc ebml_ ## N ## _desc = { str, type };
223 #define E_SN(str, count, N) const struct ebml_elem_desc ebml_ ## N ## _desc = { str, EBML_TYPE_SUBELEMENTS, sizeof(struct ebml_ ## N), count, (const struct ebml_field_desc[]){
224 #define E_S(str, count) EVALARGS(E_SN, str, count, N)
225 #define FN(id, name, multiple, N) { id, multiple, offsetof(struct ebml_ ## N, name), offsetof(struct ebml_ ## N, n_ ## name), &ebml_##name##_desc},
226 #define F(id, name, multiple) EVALARGS(FN, id, name, multiple, N)
227 #include "generated/ebml_defs.c"
228 #undef EVALARGS
229 #undef SN
230 #undef S
231 #undef FN
232 #undef F
233
234 // Used to read/write pointers to different struct types
235 struct generic;
236 #define generic_struct struct generic
237
ebml_parse_id(uint8_t * data,size_t data_len,int * length)238 static uint32_t ebml_parse_id(uint8_t *data, size_t data_len, int *length)
239 {
240 *length = -1;
241 uint8_t *end = data + data_len;
242 if (data == end)
243 return EBML_ID_INVALID;
244 int len = 1;
245 uint32_t id = *data++;
246 for (int len_mask = 0x80; !(id & len_mask); len_mask >>= 1) {
247 len++;
248 if (len > 4)
249 return EBML_ID_INVALID;
250 }
251 *length = len;
252 while (--len && data < end)
253 id = (id << 8) | *data++;
254 return id;
255 }
256
// Parse an element length (variable-size integer, 1 to 8 bytes) from a
// memory buffer.
// On success the value is returned and *length is set to the number of bytes
// consumed. On failure (empty or truncated buffer, leading byte without
// marker bit, or all value bits set) -1 is returned and *length is -1.
static uint64_t ebml_parse_length(uint8_t *data, size_t data_len, int *length)
{
    *length = -1;
    if (data_len == 0)
        return -1;

    const uint8_t *const stop = data + data_len;
    const uint8_t *cur = data;
    uint64_t value = *cur++;

    // The first set bit of the leading byte encodes the size of the field.
    int width = 1;
    int marker = 0x80;
    while (!(value & marker)) {
        if (++width > 8)
            return -1;
        marker >>= 1;
    }
    value &= marker - 1;

    // Count the bytes whose value bits are all ones.
    int num_allones = (value == (uint64_t)(marker - 1)) ? 1 : 0;
    for (int n = 1; n < width; n++) {
        if (cur == stop)
            return -1;
        uint8_t byte = *cur++;
        if (byte == 255)
            num_allones++;
        value = (value << 8) | byte;
    }

    // According to Matroska specs this means "unknown length"
    // Could be supported if there are any actual files using it
    if (num_allones == width)
        return -1;

    *length = width;
    return value;
}
290
// Decode `length` (0 to 8) bytes at `data` as a big-endian unsigned integer.
// A length of 0 yields 0.
static uint64_t ebml_parse_uint(uint8_t *data, int length)
{
    assert(length >= 0 && length <= 8);

    uint64_t value = 0;
    for (int n = 0; n < length; n++)
        value = (value << 8) | data[n];
    return value;
}
299
// Decode `length` (0 to 8) bytes at `data` as a big-endian signed integer,
// sign-extending from the top bit of the first byte.
// A length of 0 yields 0 without reading from `data`.
static int64_t ebml_parse_sint(uint8_t *data, int length)
{
    assert(length >= 0 && length <= 8);
    if (length == 0)
        return 0;

    // Start from all ones for negative values so the upper bits stay set.
    uint64_t bits = (data[0] & 0x80) ? ~(uint64_t)0 : 0;
    for (int n = 0; n < length; n++)
        bits = (bits << 8) | data[n];

    return (int64_t)bits; // assume complement of 2
}
312
// Decode a float payload of 0, 4 or 8 bytes at `data`.
// The bytes are read as a big-endian integer and reinterpreted as a float
// when the payload is 4 bytes long, and as a double otherwise (which
// includes the empty payload, whose integer value is 0).
static double ebml_parse_float(uint8_t *data, int length)
{
    assert(length == 0 || length == 4 || length == 8);

    const uint64_t bits = ebml_parse_uint(data, length);
    return length == 4 ? av_int2float(bits) : av_int2double(bits);
}
322
323
324 // target must be initialized to zero
ebml_parse_element(struct ebml_parse_ctx * ctx,void * target,uint8_t * data,int size,const struct ebml_elem_desc * type,int level)325 static void ebml_parse_element(struct ebml_parse_ctx *ctx, void *target,
326 uint8_t *data, int size,
327 const struct ebml_elem_desc *type, int level)
328 {
329 assert(type->type == EBML_TYPE_SUBELEMENTS);
330 assert(level < 8);
331 MP_TRACE(ctx, "%.*sParsing element %s\n", level, " ", type->name);
332
333 char *s = target;
334 uint8_t *end = data + size;
335 uint8_t *p = data;
336 int num_elems[MAX_EBML_SUBELEMENTS] = {0};
337 while (p < end) {
338 uint8_t *startp = p;
339 int len;
340 uint32_t id = ebml_parse_id(p, end - p, &len);
341 if (len > end - p)
342 goto past_end_error;
343 if (len < 0) {
344 MP_ERR(ctx, "Error parsing subelement id\n");
345 goto other_error;
346 }
347 p += len;
348 uint64_t length = ebml_parse_length(p, end - p, &len);
349 if (len > end - p)
350 goto past_end_error;
351 if (len < 0) {
352 MP_ERR(ctx, "Error parsing subelement length\n");
353 goto other_error;
354 }
355 p += len;
356
357 int field_idx = -1;
358 for (int i = 0; i < type->field_count; i++)
359 if (type->fields[i].id == id) {
360 field_idx = i;
361 num_elems[i]++;
362 if (num_elems[i] >= 0x70000000) {
363 MP_ERR(ctx, "Too many EBML subelements.\n");
364 goto other_error;
365 }
366 break;
367 }
368
369 if (length > end - p) {
370 if (field_idx >= 0 && type->fields[field_idx].desc->type
371 != EBML_TYPE_SUBELEMENTS) {
372 MP_ERR(ctx, "Subelement content goes "
373 "past end of containing element\n");
374 goto other_error;
375 }
376 // Try to parse what is possible from inside this partial element
377 ctx->has_errors = true;
378 length = end - p;
379 }
380 p += length;
381
382 continue;
383
384 past_end_error:
385 MP_ERR(ctx, "Subelement headers go past end of containing element\n");
386 other_error:
387 ctx->has_errors = true;
388 end = startp;
389 break;
390 }
391
392 for (int i = 0; i < type->field_count; i++) {
393 if (num_elems[i] && type->fields[i].multiple) {
394 char *ptr = s + type->fields[i].offset;
395 switch (type->fields[i].desc->type) {
396 case EBML_TYPE_SUBELEMENTS: {
397 size_t max = 1000000000 / type->fields[i].desc->size;
398 if (num_elems[i] > max) {
399 MP_ERR(ctx, "Too many subelements.\n");
400 num_elems[i] = max;
401 }
402 int sz = num_elems[i] * type->fields[i].desc->size;
403 *(generic_struct **) ptr = talloc_zero_size(ctx->talloc_ctx, sz);
404 break;
405 }
406 case EBML_TYPE_UINT:
407 *(uint64_t **) ptr = talloc_zero_array(ctx->talloc_ctx,
408 uint64_t, num_elems[i]);
409 break;
410 case EBML_TYPE_SINT:
411 *(int64_t **) ptr = talloc_zero_array(ctx->talloc_ctx,
412 int64_t, num_elems[i]);
413 break;
414 case EBML_TYPE_FLOAT:
415 *(double **) ptr = talloc_zero_array(ctx->talloc_ctx,
416 double, num_elems[i]);
417 break;
418 case EBML_TYPE_STR:
419 *(char ***) ptr = talloc_zero_array(ctx->talloc_ctx,
420 char *, num_elems[i]);
421 break;
422 case EBML_TYPE_BINARY:
423 *(struct bstr **) ptr = talloc_zero_array(ctx->talloc_ctx,
424 struct bstr,
425 num_elems[i]);
426 break;
427 case EBML_TYPE_EBML_ID:
428 *(int32_t **) ptr = talloc_zero_array(ctx->talloc_ctx,
429 uint32_t, num_elems[i]);
430 break;
431 default:
432 abort();
433 }
434 }
435 }
436
437 while (data < end) {
438 int len;
439 uint32_t id = ebml_parse_id(data, end - data, &len);
440 if (len < 0 || len > end - data) {
441 MP_ERR(ctx, "Error parsing subelement\n");
442 break;
443 }
444 data += len;
445 uint64_t length = ebml_parse_length(data, end - data, &len);
446 if (len < 0 || len > end - data) {
447 MP_ERR(ctx, "Error parsing subelement length\n");
448 break;
449 }
450 data += len;
451 if (length > end - data) {
452 // Try to parse what is possible from inside this partial element
453 length = end - data;
454 MP_ERR(ctx, "Next subelement content goes "
455 "past end of containing element, will be truncated\n");
456 }
457 int field_idx = -1;
458 for (int i = 0; i < type->field_count; i++)
459 if (type->fields[i].id == id) {
460 field_idx = i;
461 break;
462 }
463 if (field_idx < 0) {
464 if (id == 0xec) {
465 MP_TRACE(ctx, "%.*sIgnoring Void element "
466 "size: %"PRIu64"\n", level+1, " ", length);
467 } else if (id == 0xbf) {
468 MP_TRACE(ctx, "%.*sIgnoring CRC-32 "
469 "element size: %"PRIu64"\n", level+1, " ",
470 length);
471 } else {
472 MP_DBG(ctx, "Ignoring unrecognized "
473 "subelement. ID: %x size: %"PRIu64"\n", id, length);
474 }
475 data += length;
476 continue;
477 }
478 const struct ebml_field_desc *fd = &type->fields[field_idx];
479 const struct ebml_elem_desc *ed = fd->desc;
480 bool multiple = fd->multiple;
481 int *countptr = (int *) (s + fd->count_offset);
482 if (*countptr >= num_elems[field_idx]) {
483 // Shouldn't happen on any sane file without bugs
484 MP_ERR(ctx, "Too many subelements.\n");
485 ctx->has_errors = true;
486 data += length;
487 continue;
488 }
489 if (*countptr > 0 && !multiple) {
490 MP_WARN(ctx, "Another subelement of type "
491 "%x %s (size: %"PRIu64"). Only one allowed. Ignoring.\n",
492 id, ed->name, length);
493 ctx->has_errors = true;
494 data += length;
495 continue;
496 }
497 MP_TRACE(ctx, "%.*sParsing %x %s size: %"PRIu64
498 " value: ", level+1, " ", id, ed->name, length);
499
500 char *fieldptr = s + fd->offset;
501 switch (ed->type) {
502 case EBML_TYPE_SUBELEMENTS:
503 MP_TRACE(ctx, "subelements\n");
504 char *subelptr;
505 if (multiple) {
506 char *array_start = (char *) *(generic_struct **) fieldptr;
507 subelptr = array_start + *countptr * ed->size;
508 } else
509 subelptr = fieldptr;
510 ebml_parse_element(ctx, subelptr, data, length, ed, level + 1);
511 break;
512
513 case EBML_TYPE_UINT:;
514 uint64_t *uintptr;
515 #define GETPTR(subelptr, fieldtype) \
516 if (multiple) \
517 subelptr = *(fieldtype **) fieldptr + *countptr; \
518 else \
519 subelptr = (fieldtype *) fieldptr
520 GETPTR(uintptr, uint64_t);
521 if (length < 1 || length > 8) {
522 MP_ERR(ctx, "uint invalid length %"PRIu64"\n", length);
523 goto error;
524 }
525 *uintptr = ebml_parse_uint(data, length);
526 MP_TRACE(ctx, "uint %"PRIu64"\n", *uintptr);
527 break;
528
529 case EBML_TYPE_SINT:;
530 int64_t *sintptr;
531 GETPTR(sintptr, int64_t);
532 if (length > 8) {
533 MP_ERR(ctx, "sint invalid length %"PRIu64"\n", length);
534 goto error;
535 }
536 *sintptr = ebml_parse_sint(data, length);
537 MP_TRACE(ctx, "sint %"PRId64"\n", *sintptr);
538 break;
539
540 case EBML_TYPE_FLOAT:;
541 double *floatptr;
542 GETPTR(floatptr, double);
543 if (length != 0 && length != 4 && length != 8) {
544 MP_ERR(ctx, "float invalid length %"PRIu64"\n", length);
545 goto error;
546 }
547 *floatptr = ebml_parse_float(data, length);
548 MP_DBG(ctx, "float %f\n", *floatptr);
549 break;
550
551 case EBML_TYPE_STR:
552 if (length > 1024 * 1024) {
553 MP_ERR(ctx, "Not reading overly long string element.\n");
554 break;
555 }
556 char **strptr;
557 GETPTR(strptr, char *);
558 *strptr = talloc_strndup(ctx->talloc_ctx, data, length);
559 MP_TRACE(ctx, "string \"%s\"\n", *strptr);
560 break;
561
562 case EBML_TYPE_BINARY:;
563 if (length > 0x80000000) {
564 MP_ERR(ctx, "Not reading overly long EBML element.\n");
565 break;
566 }
567 struct bstr *binptr;
568 GETPTR(binptr, struct bstr);
569 binptr->start = data;
570 binptr->len = length;
571 MP_TRACE(ctx, "binary %zd bytes\n", binptr->len);
572 break;
573
574 case EBML_TYPE_EBML_ID:;
575 uint32_t *idptr;
576 GETPTR(idptr, uint32_t);
577 *idptr = ebml_parse_id(data, end - data, &len);
578 if (len != length) {
579 MP_ERR(ctx, "ebml_id broken value\n");
580 goto error;
581 }
582 MP_TRACE(ctx, "ebml_id %x\n", (unsigned)*idptr);
583 break;
584 default:
585 abort();
586 }
587 *countptr += 1;
588 error:
589 data += length;
590 }
591 }
592
593 // target must be initialized to zero
ebml_read_element(struct stream * s,struct ebml_parse_ctx * ctx,void * target,const struct ebml_elem_desc * desc)594 int ebml_read_element(struct stream *s, struct ebml_parse_ctx *ctx,
595 void *target, const struct ebml_elem_desc *desc)
596 {
597 ctx->has_errors = false;
598 int msglevel = ctx->no_error_messages ? MSGL_DEBUG : MSGL_WARN;
599 uint64_t length = ebml_read_length(s);
600 if (s->eof) {
601 MP_MSG(ctx, msglevel, "Unexpected end of file "
602 "- partial or corrupt file?\n");
603 return -1;
604 }
605 if (length == EBML_UINT_INVALID) {
606 MP_MSG(ctx, msglevel, "EBML element with unknown length - unsupported\n");
607 return -1;
608 }
609 if (length > 1000000000) {
610 MP_MSG(ctx, msglevel, "Refusing to read element over 100 MB in size\n");
611 return -1;
612 }
613 ctx->talloc_ctx = talloc_size(NULL, length);
614 int read_len = stream_read(s, ctx->talloc_ctx, length);
615 if (read_len < length)
616 MP_MSG(ctx, msglevel, "Unexpected end of file - partial or corrupt file?\n");
617 ebml_parse_element(ctx, target, ctx->talloc_ctx, read_len, desc, 0);
618 if (ctx->has_errors)
619 MP_MSG(ctx, msglevel, "Error parsing element %s\n", desc->name);
620 return 0;
621 }
622