1 /*
2  * native ebml reader for the Matroska demuxer
3  * new parser copyright (c) 2010 Uoti Urpala
4  * copyright (c) 2004 Aurelien Jacobs <aurel@gnuage.org>
5  * based on the one written by Ronald Bultje for gstreamer
6  *
7  * This file is part of mpv.
8  *
9  * mpv is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * mpv is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
21  */
22 
23 #include "config.h"
24 
25 #include <stdlib.h>
26 #include <stdbool.h>
27 #include <inttypes.h>
28 #include <stddef.h>
29 #include <assert.h>
30 
31 #include <libavutil/intfloat.h>
32 #include <libavutil/common.h>
33 #include "mpv_talloc.h"
34 #include "ebml.h"
35 #include "stream/stream.h"
36 #include "common/msg.h"
37 
38 // Whether the id is a known Matroska level 1 element (allowed as element on
39 // global file level, after the level 0 MATROSKA_ID_SEGMENT).
40 // This (intentionally) doesn't include "global" elements.
ebml_is_mkv_level1_id(uint32_t id)41 bool ebml_is_mkv_level1_id(uint32_t id)
42 {
43     switch (id) {
44     case MATROSKA_ID_SEEKHEAD:
45     case MATROSKA_ID_INFO:
46     case MATROSKA_ID_CLUSTER:
47     case MATROSKA_ID_TRACKS:
48     case MATROSKA_ID_CUES:
49     case MATROSKA_ID_ATTACHMENTS:
50     case MATROSKA_ID_CHAPTERS:
51     case MATROSKA_ID_TAGS:
52         return true;
53     default:
54         return false;
55     }
56 }
57 
58 /*
59  * Read: the element content data ID.
60  * Return: the ID.
61  */
ebml_read_id(stream_t * s)62 uint32_t ebml_read_id(stream_t *s)
63 {
64     int i, len_mask = 0x80;
65     uint32_t id;
66 
67     for (i = 0, id = stream_read_char(s); i < 4 && !(id & len_mask); i++)
68         len_mask >>= 1;
69     if (i >= 4)
70         return EBML_ID_INVALID;
71     while (i--)
72         id = (id << 8) | stream_read_char(s);
73     return id;
74 }
75 
76 /*
77  * Read: element content length.
78  */
ebml_read_length(stream_t * s)79 uint64_t ebml_read_length(stream_t *s)
80 {
81     int i, j, num_ffs = 0, len_mask = 0x80;
82     uint64_t len;
83 
84     for (i = 0, len = stream_read_char(s); i < 8 && !(len & len_mask); i++)
85         len_mask >>= 1;
86     if (i >= 8)
87         return EBML_UINT_INVALID;
88     j = i + 1;
89     if ((int) (len &= (len_mask - 1)) == len_mask - 1)
90         num_ffs++;
91     while (i--) {
92         len = (len << 8) | stream_read_char(s);
93         if ((len & 0xFF) == 0xFF)
94             num_ffs++;
95     }
96     if (j == num_ffs)
97         return EBML_UINT_INVALID;
98     if (len >= 1ULL<<63)   // Can happen if stream_read_char returns EOF
99         return EBML_UINT_INVALID;
100     return len;
101 }
102 
103 
104 /*
105  * Read a variable length signed int.
106  */
ebml_read_signed_length(stream_t * s)107 int64_t ebml_read_signed_length(stream_t *s)
108 {
109     uint64_t unum;
110     int l;
111 
112     /* read as unsigned number first */
113     uint64_t offset = stream_tell(s);
114     unum = ebml_read_length(s);
115     if (unum == EBML_UINT_INVALID)
116         return EBML_INT_INVALID;
117     l = stream_tell(s) - offset;
118 
119     return unum - ((1LL << ((7 * l) - 1)) - 1);
120 }
121 
122 /*
123  * Read the next element as an unsigned int.
124  */
ebml_read_uint(stream_t * s)125 uint64_t ebml_read_uint(stream_t *s)
126 {
127     uint64_t len, value = 0;
128 
129     len = ebml_read_length(s);
130     if (len == EBML_UINT_INVALID || len > 8)
131         return EBML_UINT_INVALID;
132 
133     while (len--)
134         value = (value << 8) | stream_read_char(s);
135 
136     return value;
137 }
138 
139 /*
140  * Read the next element as a signed int.
141  */
ebml_read_int(stream_t * s)142 int64_t ebml_read_int(stream_t *s)
143 {
144     uint64_t value = 0;
145     uint64_t len;
146     int l;
147 
148     len = ebml_read_length(s);
149     if (len == EBML_UINT_INVALID || len > 8)
150         return EBML_INT_INVALID;
151     if (!len)
152         return 0;
153 
154     len--;
155     l = stream_read_char(s);
156     if (l & 0x80)
157         value = -1;
158     value = (value << 8) | l;
159     while (len--)
160         value = (value << 8) | stream_read_char(s);
161 
162     return (int64_t)value; // assume complement of 2
163 }
164 
165 /*
166  * Skip the current element.
167  * end: the end of the parent element or -1 (for robust error handling)
168  */
ebml_read_skip(struct mp_log * log,int64_t end,stream_t * s)169 int ebml_read_skip(struct mp_log *log, int64_t end, stream_t *s)
170 {
171     uint64_t len;
172 
173     int64_t pos = stream_tell(s);
174 
175     len = ebml_read_length(s);
176     if (len == EBML_UINT_INVALID)
177         goto invalid;
178 
179     int64_t pos2 = stream_tell(s);
180     if (len >= INT64_MAX - pos2 || (end > 0 && pos2 + len > end))
181         goto invalid;
182 
183     if (!stream_seek_skip(s, pos2 + len))
184         goto invalid;
185 
186     return 0;
187 
188 invalid:
189     mp_err(log, "Invalid EBML length at position %"PRId64"\n", pos);
190     stream_seek_skip(s, pos);
191     return 1;
192 }
193 
194 /*
195  * Skip to (probable) next cluster (MATROSKA_ID_CLUSTER) element start position.
196  */
ebml_resync_cluster(struct mp_log * log,stream_t * s)197 int ebml_resync_cluster(struct mp_log *log, stream_t *s)
198 {
199     int64_t pos = stream_tell(s);
200     uint32_t last_4_bytes = 0;
201     stream_read_peek(s, &(char){0}, 1);
202     if (!s->eof) {
203         mp_err(log, "Corrupt file detected. "
204                "Trying to resync starting from position %"PRId64"...\n", pos);
205     }
206     while (!s->eof) {
207         // Assumes MATROSKA_ID_CLUSTER is 4 bytes, with no 0 bytes.
208         if (last_4_bytes == MATROSKA_ID_CLUSTER) {
209             mp_err(log, "Cluster found at %"PRId64".\n", pos - 4);
210             stream_seek(s, pos - 4);
211             return 0;
212         }
213         last_4_bytes = (last_4_bytes << 8) | stream_read_char(s);
214         pos++;
215     }
216     return -1;
217 }
218 
219 
220 
221 #define EVALARGS(F, ...) F(__VA_ARGS__)
222 #define E(str, N, type) const struct ebml_elem_desc ebml_ ## N ## _desc = { str, type };
223 #define E_SN(str, count, N) const struct ebml_elem_desc ebml_ ## N ## _desc = { str, EBML_TYPE_SUBELEMENTS, sizeof(struct ebml_ ## N), count, (const struct ebml_field_desc[]){
224 #define E_S(str, count) EVALARGS(E_SN, str, count, N)
225 #define FN(id, name, multiple, N) { id, multiple, offsetof(struct ebml_ ## N, name), offsetof(struct ebml_ ## N, n_ ## name), &ebml_##name##_desc},
226 #define F(id, name, multiple) EVALARGS(FN, id, name, multiple, N)
227 #include "generated/ebml_defs.c"
228 #undef EVALARGS
229 #undef SN
230 #undef S
231 #undef FN
232 #undef F
233 
234 // Used to read/write pointers to different struct types
235 struct generic;
236 #define generic_struct struct generic
237 
ebml_parse_id(uint8_t * data,size_t data_len,int * length)238 static uint32_t ebml_parse_id(uint8_t *data, size_t data_len, int *length)
239 {
240     *length = -1;
241     uint8_t *end = data + data_len;
242     if (data == end)
243         return EBML_ID_INVALID;
244     int len = 1;
245     uint32_t id = *data++;
246     for (int len_mask = 0x80; !(id & len_mask); len_mask >>= 1) {
247         len++;
248         if (len > 4)
249             return EBML_ID_INVALID;
250     }
251     *length = len;
252     while (--len && data < end)
253         id = (id << 8) | *data++;
254     return id;
255 }
256 
ebml_parse_length(uint8_t * data,size_t data_len,int * length)257 static uint64_t ebml_parse_length(uint8_t *data, size_t data_len, int *length)
258 {
259     *length = -1;
260     uint8_t *end = data + data_len;
261     if (data == end)
262         return -1;
263     uint64_t r = *data++;
264     int len = 1;
265     int len_mask;
266     for (len_mask = 0x80; !(r & len_mask); len_mask >>= 1) {
267         len++;
268         if (len > 8)
269             return -1;
270     }
271     r &= len_mask - 1;
272 
273     int num_allones = 0;
274     if (r == len_mask - 1)
275         num_allones++;
276     for (int i = 1; i < len; i++) {
277         if (data == end)
278             return -1;
279         if (*data == 255)
280             num_allones++;
281         r = (r << 8) | *data++;
282     }
283     // According to Matroska specs this means "unknown length"
284     // Could be supported if there are any actual files using it
285     if (num_allones == len)
286         return -1;
287     *length = len;
288     return r;
289 }
290 
// Assemble `length` (0 to 8) bytes at data into a big-endian unsigned value.
// A length of 0 yields 0.
static uint64_t ebml_parse_uint(uint8_t *data, int length)
{
    assert(length >= 0 && length <= 8);
    uint64_t value = 0;
    for (int n = 0; n < length; n++)
        value = (value << 8) + data[n];
    return value;
}
299 
// Assemble `length` (0 to 8) bytes at data into a big-endian signed value,
// sign-extending from the top bit of the first byte. A length of 0 yields 0
// without reading data.
static int64_t ebml_parse_sint(uint8_t *data, int length)
{
    assert(length >= 0 && length <= 8);
    if (length == 0)
        return 0;

    // Start from all ones for negative numbers so the upper bits stay set.
    uint64_t bits = (data[0] & 0x80) ? ~(uint64_t)0 : 0;
    for (int n = 0; n < length; n++)
        bits = (bits << 8) | data[n];
    return (int64_t)bits; // assume complement of 2
}
312 
// Decode a float element payload of 0, 4 or 8 bytes.
// The bytes are assembled as a big-endian integer; a 4 byte payload is
// converted with av_int2float(), any other accepted length (including 0)
// with av_int2double().
static double ebml_parse_float(uint8_t *data, int length)
{
    assert(length == 0 || length == 4 || length == 8);
    const uint64_t bits = ebml_parse_uint(data, length);
    return length == 4 ? av_int2float(bits) : av_int2double(bits);
}
322 
323 
324 // target must be initialized to zero
ebml_parse_element(struct ebml_parse_ctx * ctx,void * target,uint8_t * data,int size,const struct ebml_elem_desc * type,int level)325 static void ebml_parse_element(struct ebml_parse_ctx *ctx, void *target,
326                                uint8_t *data, int size,
327                                const struct ebml_elem_desc *type, int level)
328 {
329     assert(type->type == EBML_TYPE_SUBELEMENTS);
330     assert(level < 8);
331     MP_TRACE(ctx, "%.*sParsing element %s\n", level, "       ", type->name);
332 
333     char *s = target;
334     uint8_t *end = data + size;
335     uint8_t *p = data;
336     int num_elems[MAX_EBML_SUBELEMENTS] = {0};
337     while (p < end) {
338         uint8_t *startp = p;
339         int len;
340         uint32_t id = ebml_parse_id(p, end - p, &len);
341         if (len > end - p)
342             goto past_end_error;
343         if (len < 0) {
344             MP_ERR(ctx, "Error parsing subelement id\n");
345             goto other_error;
346         }
347         p += len;
348         uint64_t length = ebml_parse_length(p, end - p, &len);
349         if (len > end - p)
350             goto past_end_error;
351         if (len < 0) {
352             MP_ERR(ctx, "Error parsing subelement length\n");
353             goto other_error;
354         }
355         p += len;
356 
357         int field_idx = -1;
358         for (int i = 0; i < type->field_count; i++)
359             if (type->fields[i].id == id) {
360                 field_idx = i;
361                 num_elems[i]++;
362                 if (num_elems[i] >= 0x70000000) {
363                     MP_ERR(ctx, "Too many EBML subelements.\n");
364                     goto other_error;
365                 }
366                 break;
367             }
368 
369         if (length > end - p) {
370             if (field_idx >= 0 && type->fields[field_idx].desc->type
371                 != EBML_TYPE_SUBELEMENTS) {
372                 MP_ERR(ctx, "Subelement content goes "
373                        "past end of containing element\n");
374                 goto other_error;
375             }
376             // Try to parse what is possible from inside this partial element
377             ctx->has_errors = true;
378             length = end - p;
379         }
380         p += length;
381 
382         continue;
383 
384     past_end_error:
385         MP_ERR(ctx, "Subelement headers go past end of containing element\n");
386     other_error:
387         ctx->has_errors = true;
388         end = startp;
389         break;
390     }
391 
392     for (int i = 0; i < type->field_count; i++) {
393         if (num_elems[i] && type->fields[i].multiple) {
394             char *ptr = s + type->fields[i].offset;
395             switch (type->fields[i].desc->type) {
396             case EBML_TYPE_SUBELEMENTS: {
397                 size_t max = 1000000000 / type->fields[i].desc->size;
398                 if (num_elems[i] > max) {
399                     MP_ERR(ctx, "Too many subelements.\n");
400                     num_elems[i] = max;
401                 }
402                 int sz = num_elems[i] * type->fields[i].desc->size;
403                 *(generic_struct **) ptr = talloc_zero_size(ctx->talloc_ctx, sz);
404                 break;
405             }
406             case EBML_TYPE_UINT:
407                 *(uint64_t **) ptr = talloc_zero_array(ctx->talloc_ctx,
408                                                        uint64_t, num_elems[i]);
409                 break;
410             case EBML_TYPE_SINT:
411                 *(int64_t **) ptr = talloc_zero_array(ctx->talloc_ctx,
412                                                       int64_t, num_elems[i]);
413                 break;
414             case EBML_TYPE_FLOAT:
415                 *(double **) ptr = talloc_zero_array(ctx->talloc_ctx,
416                                                      double, num_elems[i]);
417                 break;
418             case EBML_TYPE_STR:
419                 *(char ***) ptr = talloc_zero_array(ctx->talloc_ctx,
420                                                     char *, num_elems[i]);
421                 break;
422             case EBML_TYPE_BINARY:
423                 *(struct bstr **) ptr = talloc_zero_array(ctx->talloc_ctx,
424                                                           struct bstr,
425                                                           num_elems[i]);
426                 break;
427             case EBML_TYPE_EBML_ID:
428                 *(int32_t **) ptr = talloc_zero_array(ctx->talloc_ctx,
429                                                       uint32_t, num_elems[i]);
430                 break;
431             default:
432                 abort();
433             }
434         }
435     }
436 
437     while (data < end) {
438         int len;
439         uint32_t id = ebml_parse_id(data, end - data, &len);
440         if (len < 0 || len > end - data) {
441             MP_ERR(ctx, "Error parsing subelement\n");
442             break;
443         }
444         data += len;
445         uint64_t length = ebml_parse_length(data, end - data, &len);
446         if (len < 0 || len > end - data) {
447             MP_ERR(ctx, "Error parsing subelement length\n");
448             break;
449         }
450         data += len;
451         if (length > end - data) {
452             // Try to parse what is possible from inside this partial element
453             length = end - data;
454             MP_ERR(ctx, "Next subelement content goes "
455                    "past end of containing element, will be truncated\n");
456         }
457         int field_idx = -1;
458         for (int i = 0; i < type->field_count; i++)
459             if (type->fields[i].id == id) {
460                 field_idx = i;
461                 break;
462             }
463         if (field_idx < 0) {
464             if (id == 0xec) {
465                 MP_TRACE(ctx, "%.*sIgnoring Void element "
466                          "size: %"PRIu64"\n", level+1, "        ", length);
467             } else if (id == 0xbf) {
468                 MP_TRACE(ctx, "%.*sIgnoring CRC-32 "
469                          "element size: %"PRIu64"\n", level+1, "        ",
470                          length);
471             } else {
472                 MP_DBG(ctx, "Ignoring unrecognized "
473                        "subelement. ID: %x size: %"PRIu64"\n", id, length);
474             }
475             data += length;
476             continue;
477         }
478         const struct ebml_field_desc *fd = &type->fields[field_idx];
479         const struct ebml_elem_desc *ed = fd->desc;
480         bool multiple = fd->multiple;
481         int *countptr = (int *) (s + fd->count_offset);
482         if (*countptr >= num_elems[field_idx]) {
483             // Shouldn't happen on any sane file without bugs
484             MP_ERR(ctx, "Too many subelements.\n");
485             ctx->has_errors = true;
486             data += length;
487             continue;
488         }
489         if (*countptr > 0 && !multiple) {
490             MP_WARN(ctx, "Another subelement of type "
491                     "%x %s (size: %"PRIu64"). Only one allowed. Ignoring.\n",
492                     id, ed->name, length);
493             ctx->has_errors = true;
494             data += length;
495             continue;
496         }
497         MP_TRACE(ctx, "%.*sParsing %x %s size: %"PRIu64
498                  " value: ", level+1, "        ", id, ed->name, length);
499 
500         char *fieldptr = s + fd->offset;
501         switch (ed->type) {
502         case EBML_TYPE_SUBELEMENTS:
503             MP_TRACE(ctx, "subelements\n");
504             char *subelptr;
505             if (multiple) {
506                 char *array_start = (char *) *(generic_struct **) fieldptr;
507                 subelptr = array_start + *countptr * ed->size;
508             } else
509                 subelptr = fieldptr;
510             ebml_parse_element(ctx, subelptr, data, length, ed, level + 1);
511             break;
512 
513         case EBML_TYPE_UINT:;
514             uint64_t *uintptr;
515 #define GETPTR(subelptr, fieldtype)                                     \
516             if (multiple)                                               \
517                 subelptr = *(fieldtype **) fieldptr + *countptr;        \
518             else                                                        \
519                 subelptr = (fieldtype *) fieldptr
520             GETPTR(uintptr, uint64_t);
521             if (length < 1 || length > 8) {
522                 MP_ERR(ctx, "uint invalid length %"PRIu64"\n", length);
523                 goto error;
524             }
525             *uintptr = ebml_parse_uint(data, length);
526             MP_TRACE(ctx, "uint %"PRIu64"\n", *uintptr);
527             break;
528 
529         case EBML_TYPE_SINT:;
530             int64_t *sintptr;
531             GETPTR(sintptr, int64_t);
532             if (length > 8) {
533                 MP_ERR(ctx, "sint invalid length %"PRIu64"\n", length);
534                 goto error;
535             }
536             *sintptr = ebml_parse_sint(data, length);
537             MP_TRACE(ctx, "sint %"PRId64"\n", *sintptr);
538             break;
539 
540         case EBML_TYPE_FLOAT:;
541             double *floatptr;
542             GETPTR(floatptr, double);
543             if (length != 0 && length != 4 && length != 8) {
544                 MP_ERR(ctx, "float invalid length %"PRIu64"\n", length);
545                 goto error;
546             }
547             *floatptr = ebml_parse_float(data, length);
548             MP_DBG(ctx, "float %f\n", *floatptr);
549             break;
550 
551         case EBML_TYPE_STR:
552             if (length > 1024 * 1024) {
553                 MP_ERR(ctx, "Not reading overly long string element.\n");
554                 break;
555             }
556             char **strptr;
557             GETPTR(strptr, char *);
558             *strptr = talloc_strndup(ctx->talloc_ctx, data, length);
559             MP_TRACE(ctx, "string \"%s\"\n", *strptr);
560             break;
561 
562         case EBML_TYPE_BINARY:;
563             if (length > 0x80000000) {
564                 MP_ERR(ctx, "Not reading overly long EBML element.\n");
565                 break;
566             }
567             struct bstr *binptr;
568             GETPTR(binptr, struct bstr);
569             binptr->start = data;
570             binptr->len = length;
571             MP_TRACE(ctx, "binary %zd bytes\n", binptr->len);
572             break;
573 
574         case EBML_TYPE_EBML_ID:;
575             uint32_t *idptr;
576             GETPTR(idptr, uint32_t);
577             *idptr = ebml_parse_id(data, end - data, &len);
578             if (len != length) {
579                 MP_ERR(ctx, "ebml_id broken value\n");
580                 goto error;
581             }
582             MP_TRACE(ctx, "ebml_id %x\n", (unsigned)*idptr);
583             break;
584         default:
585             abort();
586         }
587         *countptr += 1;
588     error:
589         data += length;
590     }
591 }
592 
593 // target must be initialized to zero
ebml_read_element(struct stream * s,struct ebml_parse_ctx * ctx,void * target,const struct ebml_elem_desc * desc)594 int ebml_read_element(struct stream *s, struct ebml_parse_ctx *ctx,
595                       void *target, const struct ebml_elem_desc *desc)
596 {
597     ctx->has_errors = false;
598     int msglevel = ctx->no_error_messages ? MSGL_DEBUG : MSGL_WARN;
599     uint64_t length = ebml_read_length(s);
600     if (s->eof) {
601         MP_MSG(ctx, msglevel, "Unexpected end of file "
602                    "- partial or corrupt file?\n");
603         return -1;
604     }
605     if (length == EBML_UINT_INVALID) {
606         MP_MSG(ctx, msglevel, "EBML element with unknown length - unsupported\n");
607         return -1;
608     }
609     if (length > 1000000000) {
610         MP_MSG(ctx, msglevel, "Refusing to read element over 100 MB in size\n");
611         return -1;
612     }
613     ctx->talloc_ctx = talloc_size(NULL, length);
614     int read_len = stream_read(s, ctx->talloc_ctx, length);
615     if (read_len < length)
616         MP_MSG(ctx, msglevel, "Unexpected end of file - partial or corrupt file?\n");
617     ebml_parse_element(ctx, target, ctx->talloc_ctx, read_len, desc, 0);
618     if (ctx->has_errors)
619         MP_MSG(ctx, msglevel, "Error parsing element %s\n", desc->name);
620     return 0;
621 }
622