1# -*- coding: utf-8 -*-
2# enzyme - Video metadata parser
3# Copyright 2011-2012 Antoine Bertin <diaoulael@gmail.com>
4# Copyright 2003-2006 Thomas Schueppel <stain@acm.org>
5# Copyright 2003-2006 Dirk Meyer <dischi@freevo.org>
6# Copyright 2003-2006 Jason Tackaberry <tack@urandom.ca>
7#
8# This file is part of enzyme.
9#
10# enzyme is free software; you can redistribute it and/or modify it under
11# the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 3 of the License, or
13# (at your option) any later version.
14#
15# enzyme is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with enzyme.  If not, see <http://www.gnu.org/licenses/>.
22from __future__ import absolute_import
23from datetime import datetime
24from .exceptions import ParseError
25from struct import unpack
26from . import core
27import logging
28import re
29
# Public names of this module: only ``Parser`` is exported.
__all__ = ['Parser']


# Module-level logger, named after this module.
log = logging.getLogger(__name__)
35
# Values of the track type element (MATROSKA_TRACK_TYPE_ID) handled by the
# parser
MATROSKA_VIDEO_TRACK = 0x01
MATROSKA_AUDIO_TRACK = 0x02
MATROSKA_SUBTITLES_TRACK = 0x11

# EBML element IDs: top level and segment info
MATROSKA_HEADER_ID = 0x1A45DFA3
MATROSKA_TRACKS_ID = 0x1654AE6B
MATROSKA_CUES_ID = 0x1C53BB6B
MATROSKA_SEGMENT_ID = 0x18538067
MATROSKA_SEGMENT_INFO_ID = 0x1549A966
MATROSKA_CLUSTER_ID = 0x1F43B675
MATROSKA_VOID_ID = 0xEC
# MATROSKA_CRC_ID and MATROSKA_CRC32_ID are two names for the same ID; both
# are kept because other code may refer to either of them.
MATROSKA_CRC_ID = 0xBF
MATROSKA_TIMECODESCALE_ID = 0x2AD7B1
MATROSKA_DURATION_ID = 0x4489
MATROSKA_CRC32_ID = 0xBF
MATROSKA_MUXING_APP_ID = 0x4D80
MATROSKA_WRITING_APP_ID = 0x5741

# EBML element IDs: tracks
MATROSKA_CODEC_ID = 0x86
MATROSKA_CODEC_PRIVATE_ID = 0x63A2
MATROSKA_FRAME_DURATION_ID = 0x23E383
MATROSKA_VIDEO_SETTINGS_ID = 0xE0
MATROSKA_VIDEO_WIDTH_ID = 0xB0
MATROSKA_VIDEO_HEIGHT_ID = 0xBA
MATROSKA_VIDEO_INTERLACED_ID = 0x9A
MATROSKA_VIDEO_DISPLAY_WIDTH_ID = 0x54B0
MATROSKA_VIDEO_DISPLAY_HEIGHT_ID = 0x54BA
MATROSKA_AUDIO_SETTINGS_ID = 0xE1
MATROSKA_AUDIO_SAMPLERATE_ID = 0xB5
MATROSKA_AUDIO_CHANNELS_ID = 0x9F
MATROSKA_TRACK_UID_ID = 0x73C5
MATROSKA_TRACK_NUMBER_ID = 0xD7
MATROSKA_TRACK_TYPE_ID = 0x83
MATROSKA_TRACK_LANGUAGE_ID = 0x22B59C
MATROSKA_TRACK_OFFSET = 0x537F
MATROSKA_TRACK_FLAG_DEFAULT_ID = 0x88
MATROSKA_TRACK_FLAG_ENABLED_ID = 0xB9
MATROSKA_TITLE_ID = 0x7BA9
MATROSKA_DATE_UTC_ID = 0x4461
MATROSKA_NAME_ID = 0x536E

# EBML element IDs: chapters
MATROSKA_CHAPTERS_ID = 0x1043A770
MATROSKA_CHAPTER_UID_ID = 0x73C4
MATROSKA_EDITION_ENTRY_ID = 0x45B9
MATROSKA_CHAPTER_ATOM_ID = 0xB6
MATROSKA_CHAPTER_TIME_START_ID = 0x91
MATROSKA_CHAPTER_TIME_END_ID = 0x92
MATROSKA_CHAPTER_FLAG_ENABLED_ID = 0x4598
MATROSKA_CHAPTER_DISPLAY_ID = 0x80
MATROSKA_CHAPTER_LANGUAGE_ID = 0x437C
MATROSKA_CHAPTER_STRING_ID = 0x85

# EBML element IDs: attachments
MATROSKA_ATTACHMENTS_ID = 0x1941A469
MATROSKA_ATTACHED_FILE_ID = 0x61A7
MATROSKA_FILE_DESC_ID = 0x467E
MATROSKA_FILE_NAME_ID = 0x466E
MATROSKA_FILE_MIME_TYPE_ID = 0x4660
MATROSKA_FILE_DATA_ID = 0x465C

# EBML element IDs: seek head
MATROSKA_SEEKHEAD_ID = 0x114D9B74
MATROSKA_SEEK_ID = 0x4DBB
MATROSKA_SEEKID_ID = 0x53AB
MATROSKA_SEEK_POSITION_ID = 0x53AC

# EBML element IDs: tags
MATROSKA_TAGS_ID = 0x1254C367
MATROSKA_TAG_ID = 0x7373
MATROSKA_TARGETS_ID = 0x63C0
MATROSKA_TARGET_TYPE_VALUE_ID = 0x68CA
MATROSKA_TARGET_TYPE_ID = 0x63CA
# The "MATRSOKA" spelling is a historical typo.  The misspelled names are
# kept because existing code refers to them; correctly spelled aliases are
# provided below.
MATRSOKA_TAGS_TRACK_UID_ID = 0x63C5
MATRSOKA_TAGS_EDITION_UID_ID = 0x63C9
MATRSOKA_TAGS_CHAPTER_UID_ID = 0x63C4
MATRSOKA_TAGS_ATTACHMENT_UID_ID = 0x63C6
MATROSKA_TAGS_TRACK_UID_ID = MATRSOKA_TAGS_TRACK_UID_ID
MATROSKA_TAGS_EDITION_UID_ID = MATRSOKA_TAGS_EDITION_UID_ID
MATROSKA_TAGS_CHAPTER_UID_ID = MATRSOKA_TAGS_CHAPTER_UID_ID
MATROSKA_TAGS_ATTACHMENT_UID_ID = MATRSOKA_TAGS_ATTACHMENT_UID_ID
MATROSKA_SIMPLE_TAG_ID = 0x67C8
MATROSKA_TAG_NAME_ID = 0x45A3
MATROSKA_TAG_LANGUAGE_ID = 0x447A
MATROSKA_TAG_STRING_ID = 0x4487
MATROSKA_TAG_BINARY_ID = 0x4485
115
116
117# See mkv spec for details:
118# http://www.matroska.org/technical/specs/index.html
119
# Translation table from Matroska codec ids to well known codes: video ids
# map to a four character string, audio ids map to an integer code.
# http://haali.cs.msu.ru/mkv/codecs.pdf
FOURCCMap = dict([
    # Video codecs
    ('V_THEORA', 'THEO'),
    ('V_SNOW', 'SNOW'),
    ('V_MPEG4/ISO/ASP', 'MP4V'),
    ('V_MPEG4/ISO/AVC', 'AVC1'),
    # Audio codecs
    ('A_AC3', 0x2000),
    ('A_MPEG/L3', 0x0055),
    ('A_MPEG/L2', 0x0050),
    ('A_MPEG/L1', 0x0050),
    ('A_DTS', 0x2001),
    ('A_PCM/INT/LIT', 0x0001),
    ('A_PCM/FLOAT/IEEE', 0x0003),
    ('A_TTA1', 0x77a1),
    ('A_WAVPACK4', 0x5756),
    ('A_VORBIS', 0x6750),
    ('A_FLAC', 0xF1AC),
    ('A_AAC', 0x00ff),
    # Prefix entry: matched by callers against "<family>/" of a codec id
    ('A_AAC/', 0x00ff),
])
141
142
def matroska_date_to_datetime(date):
    """
    Converts a date in Matroska's date format to a python datetime object.

    The full ``YYYY-MM-DD HH:MM:SS.MSS`` layout is tried first, then
    progressively shorter layouts (dropping one field from the right each
    time) down to a bare year.

    Returns the given value unchanged if it could not be converted; this
    includes values that are not strings at all (e.g. None).
    """
    # From the specs:
    #   The fields with dates should have the following format: YYYY-MM-DD
    #   HH:MM:SS.MSS [...] To store less accuracy, you remove items starting
    #   from the right. To store only the year, you would use, "2004". To store
    #   a specific day such as May 1st, 2003, you would use "2003-05-01".
    #
    # Splitting with a capturing group keeps the separators, so the list
    # alternates directive, separator, directive, ...
    layout = re.split(r'([-:. ])', '%Y-%m-%d %H:%M:%S.%f')
    while layout:
        try:
            return datetime.strptime(date, ''.join(layout))
        except ValueError:
            # Drop the last directive together with the separator before it.
            layout = layout[:-2]
        except TypeError:
            # Not a string: no layout can ever match, hand the value back.
            return date
    return date
160
161
def matroska_bps_to_bitrate(bps):
    """
    Tries to convert a free-form bps string into a bitrate (bits per second).

    The string is expected to be a number optionally followed by a unit.
    Recognised units are kbit/kbps (x1024), kbyte (x1024x8), byte (x8) and
    bps/bit (x1).  A number without a recognised unit is taken as kilobits
    when it is below 30000 and as bits per second otherwise.

    Returns a float, or None when no number could be extracted.
    """
    m = re.search(r'([\d.]+)\s*(\D.*)', bps)
    if m:
        bps, suffix = m.groups()
        # 'kbps' must be tested before the plain 'bps' check below,
        # otherwise "128 kbps" would be reported as 128 bits per second.
        if 'kbit' in suffix or 'kbps' in suffix:
            return float(bps) * 1024
        elif 'kbyte' in suffix:
            return float(bps) * 1024 * 8
        elif 'byte' in suffix:
            return float(bps) * 8
        elif 'bps' in suffix or 'bit' in suffix:
            return float(bps)
    # Either there was no unit at all, or the unit was not recognised; in
    # the latter case bps now holds only the numeric part.
    if bps.replace('.', '').isdigit():
        if float(bps) < 30000:
            # Assume kilobits and convert to bps
            return float(bps) * 1024
        return float(bps)
182
183
def _split_keywords(s):
    """
    Splits a comma separated string into a list of whitespace-stripped words.
    """
    return [word.strip() for word in s.split(',')]


# Lookup table from the official matroska tag names (lower-cased) to core
# attributes.  Each entry is: tag name -> (attribute name, converter), where
# the converter is a callable applied to the tag value or None.
TAGS_MAP = {
    # From Media core
    'title':                ('title', None),
    'subtitle':             ('caption', None),
    'comment':              ('comment', None),
    'url':                  ('url', None),
    'artist':               ('artist', None),
    'keywords':             ('keywords', _split_keywords),
    'composer_nationality': ('country', None),
    'date_released':        ('datetime', None),
    'date_recorded':        ('datetime', None),
    'date_written':         ('datetime', None),

    # From Video core
    'encoder':              ('encoder', None),
    'bps':                  ('bitrate', matroska_bps_to_bitrate),
    'part_number':          ('trackno', int),
    'total_parts':          ('trackof', int),
    'copyright':            ('copyright', None),
    'genre':                ('genre', None),
    'actor':                ('actors', None),
    'written_by':           ('writer', None),
    'producer':             ('producer', None),
    'production_studio':    ('studio', None),
    'law_rating':           ('rating', None),
    'summary':              ('summary', None),
    'synopsis':             ('synopsis', None),
}
214
215
class EbmlEntity:
    """
    Handles one EBML entity as described in the Matroska/EBML spec.

    An entity is parsed from the start of a byte buffer: first the ID
    (1-4 bytes), then the length field (1-8 bytes), then the payload.  CRC
    entities found at the start of the buffer are skipped; their total size
    is available through get_crc_len().

    The buffer may be a Python 2 ``str`` or a Python 3 ``bytes`` object.

    Raises ParseError if no valid entity starts the buffer or if the buffer
    ends in the middle of the entity header.
    """
    def __init__(self, inbuf):
        # Number of bytes taken by CRC entities skipped before this one
        self.crc_len = 0
        try:
            self.build_entity(inbuf)
            # Now loop until we find an entity without CRC
            while self.get_id() == MATROSKA_CRC32_ID:
                skipped = self.get_total_len()
                self.crc_len += skipped
                inbuf = inbuf[skipped:]
                self.build_entity(inbuf)
        except IndexError:
            # The buffer ended inside a header, either in the first entity
            # or in one following a CRC entity.
            raise ParseError()

    @staticmethod
    def _byte_at(buf, pos):
        """
        Returns the byte at index pos of buf as an integer.

        Indexing yields an int for Python 3 bytes and a one character
        string for Python 2 str; both are handled.  Raises IndexError when
        pos is outside the buffer.
        """
        item = buf[pos]
        if isinstance(item, int):
            return item
        return ord(item)

    def _decode(self, encoding):
        """
        Returns the payload decoded with the given encoding; undecodable
        bytes are replaced.  A payload that is not a byte string is returned
        unchanged.
        """
        data = self.entity_data
        if isinstance(data, bytes):
            return data.decode(encoding, 'replace')
        return data

    def build_entity(self, inbuf):
        """
        Parses ID, length and payload of the entity starting at inbuf[0].

        Raises ParseError if no ID could be read, and IndexError if the
        length field is truncated.
        """
        self.compute_id(inbuf)

        if self.id_len == 0:
            log.error('EBML entity not found, bad file format')
            raise ParseError()

        # entity_len must be set before the payload is sliced because
        # get_total_len() depends on it.
        self.entity_len, self.len_size = self.compute_len(inbuf[self.id_len:])
        self.entity_data = inbuf[self.get_header_len():self.get_total_len()]
        # Length announced by the header; the buffer may hold fewer bytes,
        # in which case entity_len is the number actually available.
        self.ebml_length = self.entity_len
        self.entity_len = min(len(self.entity_data), self.entity_len)

        # if the data size is 8 or less, it could be a numeric value
        # (interpreted as a big-endian unsigned integer)
        self.value = 0
        if self.entity_len <= 8:
            for pos in range(self.entity_len):
                self.value = (self.value << 8) | self._byte_at(self.entity_data, pos)

    def add_data(self, data):
        """
        Appends data to the payload, never exceeding the length announced
        by the entity header.  The numeric value is not recomputed.
        """
        maxlen = self.ebml_length - len(self.entity_data)
        if maxlen <= 0:
            return
        self.entity_data += data[:maxlen]
        self.entity_len = len(self.entity_data)

    def compute_id(self, inbuf):
        """
        Reads the entity ID at the start of inbuf.

        Sets id_len (0 if no ID could be read) and, when an ID was found,
        entity_id and entity_str (the raw ID bytes).
        """
        self.id_len = 0
        if len(inbuf) < 1:
            return 0
        first = self._byte_at(inbuf, 0)
        # The highest set bit of the first byte gives the width of the ID.
        for width, marker in enumerate((0x80, 0x40, 0x20, 0x10), 1):
            if first & marker:
                if len(inbuf) < width:
                    return 0
                entity_id = 0
                for pos in range(width):
                    entity_id = (entity_id << 8) | self._byte_at(inbuf, pos)
                self.id_len = width
                self.entity_id = entity_id
                break
        self.entity_str = inbuf[0:self.id_len]

    def compute_len(self, inbuf):
        """
        Decodes the variable size length field at the start of inbuf.

        Returns a tuple (length, size of the length field in bytes).  The
        result is (0, 0) for an empty buffer or a first byte of zero, and
        the length is 0 when all value bits are set.  Raises IndexError
        if the field is truncated.
        """
        if not inbuf:
            return 0, 0
        extra = num_ffs = 0
        len_mask = 0x80
        length = self._byte_at(inbuf, 0)
        # The number of leading zero bits is the number of extra bytes.
        while not length & len_mask:
            extra += 1
            len_mask >>= 1
            if extra >= 8:
                return 0, 0

        # Strip the marker bit, keeping only the value bits.
        length &= len_mask - 1
        if length == len_mask - 1:
            num_ffs += 1
        for pos in range(1, extra + 1):
            length = (length << 8) | self._byte_at(inbuf, pos)
            if length & 0xff == 0xff:
                num_ffs += 1
        if num_ffs == extra + 1:
            # Every value bit is set: report the length as 0.
            length = 0
        return length, extra + 1

    def get_crc_len(self):
        """Returns the total size of the CRC entities that were skipped."""
        return self.crc_len

    def get_value(self):
        """
        Returns the payload as an unsigned integer (0 if the payload is
        longer than 8 bytes).
        """
        return self.value

    def get_float_value(self):
        """
        Returns the payload as a float: 4 byte payloads are read as single
        precision, 8 byte payloads as double precision, anything else
        yields 0.0.
        """
        if len(self.entity_data) == 4:
            return unpack('!f', self.entity_data)[0]
        elif len(self.entity_data) == 8:
            return unpack('!d', self.entity_data)[0]
        return 0.0

    def get_data(self):
        """Returns the raw payload."""
        return self.entity_data

    def get_utf8(self):
        """Returns the payload decoded as UTF-8 text."""
        return self._decode('utf-8')

    def get_str(self):
        """Returns the payload decoded as ASCII text."""
        return self._decode('ascii')

    def get_id(self):
        """Returns the numeric entity ID."""
        return self.entity_id

    def get_str_id(self):
        """Returns the raw bytes of the entity ID."""
        return self.entity_str

    def get_len(self):
        """Returns the number of payload bytes currently available."""
        return self.entity_len

    def get_total_len(self):
        """Returns the size of header plus available payload."""
        return self.entity_len + self.id_len + self.len_size

    def get_header_len(self):
        """Returns the size of the ID and length fields together."""
        return self.id_len + self.len_size
360
361
362
363class Matroska(core.AVContainer):
364    """
365    Matroska video and audio parser. If at least one video stream is
366    detected it will set the type to MEDIA_AV.
367    """
368    def __init__(self, file):
369        core.AVContainer.__init__(self)
370        self.samplerate = 1
371
372        self.file = file
373        # Read enough that we're likely to get the full seekhead (FIXME: kludge)
374        buffer = file.read(2000)
375        if len(buffer) == 0:
376            # Regular File end
377            raise ParseError()
378
379        # Check the Matroska header
380        header = EbmlEntity(buffer)
381        if header.get_id() != MATROSKA_HEADER_ID:
382            raise ParseError()
383
384        log.debug('HEADER ID found %08X' % header.get_id())
385        self.mime = 'video/x-matroska'
386        self.type = 'Matroska'
387        self.has_idx = False
388        self.objects_by_uid = {}
389
390        # Now get the segment
391        self.segment = segment = EbmlEntity(buffer[header.get_total_len():])
392        # Record file offset of segment data for seekheads
393        self.segment.offset = header.get_total_len() + segment.get_header_len()
394        if segment.get_id() != MATROSKA_SEGMENT_ID:
395            log.debug('SEGMENT ID not found %08X' % segment.get_id())
396            return
397
398        log.debug('SEGMENT ID found %08X' % segment.get_id())
399        try:
400            for elem in self.process_one_level(segment):
401                if elem.get_id() == MATROSKA_SEEKHEAD_ID:
402                    self.process_elem(elem)
403        except ParseError:
404            pass
405
406        if not self.has_idx:
407            log.warning('File has no index')
408            self._set('corrupt', True)
409
410    def process_elem(self, elem):
411        elem_id = elem.get_id()
412        log.debug('BEGIN: process element %r' % hex(elem_id))
413        if elem_id == MATROSKA_SEGMENT_INFO_ID:
414            duration = 0
415            scalecode = 1000000.0
416
417            for ielem in self.process_one_level(elem):
418                ielem_id = ielem.get_id()
419                if ielem_id == MATROSKA_TIMECODESCALE_ID:
420                    scalecode = ielem.get_value()
421                elif ielem_id == MATROSKA_DURATION_ID:
422                    duration = ielem.get_float_value()
423                elif ielem_id == MATROSKA_TITLE_ID:
424                    self.title = ielem.get_utf8()
425                elif ielem_id == MATROSKA_DATE_UTC_ID:
426                    timestamp = unpack('!q', ielem.get_data())[0] / 10.0 ** 9
427                    # Date is offset 2001-01-01 00:00:00 (timestamp 978307200.0)
428                    self.timestamp = int(timestamp + 978307200)
429
430            self.length = duration * scalecode / 1000000000.0
431
432        elif elem_id == MATROSKA_TRACKS_ID:
433            self.process_tracks(elem)
434
435        elif elem_id == MATROSKA_CHAPTERS_ID:
436            self.process_chapters(elem)
437
438        elif elem_id == MATROSKA_ATTACHMENTS_ID:
439            self.process_attachments(elem)
440
441        elif elem_id == MATROSKA_SEEKHEAD_ID:
442            self.process_seekhead(elem)
443
444        elif elem_id == MATROSKA_TAGS_ID:
445            self.process_tags(elem)
446
447        elif elem_id == MATROSKA_CUES_ID:
448            self.has_idx = True
449
450        log.debug('END: process element %r' % hex(elem_id))
451        return True
452
453
454    def process_seekhead(self, elem):
455        for seek_elem in self.process_one_level(elem):
456            if seek_elem.get_id() != MATROSKA_SEEK_ID:
457                continue
458            for sub_elem in self.process_one_level(seek_elem):
459                if sub_elem.get_id() == MATROSKA_SEEKID_ID:
460                    if sub_elem.get_value() == MATROSKA_CLUSTER_ID:
461                        # Not interested in these.
462                        return
463
464                elif sub_elem.get_id() == MATROSKA_SEEK_POSITION_ID:
465                    self.file.seek(self.segment.offset + sub_elem.get_value())
466                    buffer = self.file.read(100)
467                    try:
468                        elem = EbmlEntity(buffer)
469                    except ParseError:
470                        continue
471
472                    # Fetch all data necessary for this element.
473                    elem.add_data(self.file.read(elem.ebml_length))
474                    self.process_elem(elem)
475
476
477    def process_tracks(self, tracks):
478        tracksbuf = tracks.get_data()
479        index = 0
480        while index < tracks.get_len():
481            trackelem = EbmlEntity(tracksbuf[index:])
482            log.debug ('ELEMENT %X found' % trackelem.get_id())
483            self.process_track(trackelem)
484            index += trackelem.get_total_len() + trackelem.get_crc_len()
485
486
487    def process_one_level(self, item):
488        buf = item.get_data()
489        index = 0
490        while index < item.get_len():
491            if len(buf[index:]) == 0:
492                break
493            elem = EbmlEntity(buf[index:])
494            yield elem
495            index += elem.get_total_len() + elem.get_crc_len()
496
497    def set_track_defaults(self, track):
498        track.language = 'eng'
499
500    def process_track(self, track):
501        # Collapse generator into a list since we need to iterate over it
502        # twice.
503        elements = [x for x in self.process_one_level(track)]
504        track_type = [x.get_value() for x in elements if x.get_id() == MATROSKA_TRACK_TYPE_ID]
505        if not track_type:
506            log.debug('Bad track: no type id found')
507            return
508
509        track_type = track_type[0]
510        track = None
511
512        if track_type == MATROSKA_VIDEO_TRACK:
513            log.debug('Video track found')
514            track = self.process_video_track(elements)
515        elif track_type == MATROSKA_AUDIO_TRACK:
516            log.debug('Audio track found')
517            track = self.process_audio_track(elements)
518        elif track_type == MATROSKA_SUBTITLES_TRACK:
519            log.debug('Subtitle track found')
520            track = core.Subtitle()
521            self.set_track_defaults(track)
522            track.id = len(self.subtitles)
523            self.subtitles.append(track)
524            for elem in elements:
525                self.process_track_common(elem, track)
526
527
528    def process_track_common(self, elem, track):
529        elem_id = elem.get_id()
530        if elem_id == MATROSKA_TRACK_LANGUAGE_ID:
531            track.language = elem.get_str()
532            log.debug('Track language found: %r' % track.language)
533        elif elem_id == MATROSKA_NAME_ID:
534            track.title = elem.get_utf8()
535        elif elem_id == MATROSKA_TRACK_NUMBER_ID:
536            track.trackno = elem.get_value()
537        elif elem_id == MATROSKA_TRACK_FLAG_ENABLED_ID:
538            track.enabled = bool(elem.get_value())
539        elif elem_id == MATROSKA_TRACK_FLAG_DEFAULT_ID:
540            track.default = bool(elem.get_value())
541        elif elem_id == MATROSKA_CODEC_ID:
542            track.codec = elem.get_str()
543        elif elem_id == MATROSKA_CODEC_PRIVATE_ID:
544            track.codec_private = elem.get_data()
545        elif elem_id == MATROSKA_TRACK_UID_ID:
546            self.objects_by_uid[elem.get_value()] = track
547
548
549    def process_video_track(self, elements):
550        track = core.VideoStream()
551        # Defaults
552        track.codec = 'Unknown'
553        track.fps = 0
554        self.set_track_defaults(track)
555
556        for elem in elements:
557            elem_id = elem.get_id()
558            if elem_id == MATROSKA_CODEC_ID:
559                track.codec = elem.get_str()
560
561            elif elem_id == MATROSKA_FRAME_DURATION_ID:
562                try:
563                    track.fps = 1 / (pow(10, -9) * (elem.get_value()))
564                except ZeroDivisionError:
565                    pass
566
567            elif elem_id == MATROSKA_VIDEO_SETTINGS_ID:
568                d_width = d_height = None
569                for settings_elem in self.process_one_level(elem):
570                    settings_elem_id = settings_elem.get_id()
571                    if settings_elem_id == MATROSKA_VIDEO_WIDTH_ID:
572                        track.width = settings_elem.get_value()
573                    elif settings_elem_id == MATROSKA_VIDEO_HEIGHT_ID:
574                        track.height = settings_elem.get_value()
575                    elif settings_elem_id == MATROSKA_VIDEO_DISPLAY_WIDTH_ID:
576                        d_width = settings_elem.get_value()
577                    elif settings_elem_id == MATROSKA_VIDEO_DISPLAY_HEIGHT_ID:
578                        d_height = settings_elem.get_value()
579                    elif settings_elem_id == MATROSKA_VIDEO_INTERLACED_ID:
580                        value = int(settings_elem.get_value())
581                        self._set('interlaced', value)
582
583                if None not in [d_width, d_height]:
584                    track.aspect = float(d_width) / d_height
585
586            else:
587                self.process_track_common(elem, track)
588
589        # convert codec information
590        # http://haali.cs.msu.ru/mkv/codecs.pdf
591        if track.codec in FOURCCMap:
592            track.codec = FOURCCMap[track.codec]
593        elif '/' in track.codec and track.codec.split('/')[0] + '/' in FOURCCMap:
594            track.codec = FOURCCMap[track.codec.split('/')[0] + '/']
595        elif track.codec.endswith('FOURCC') and len(track.codec_private or '') == 40:
596            track.codec = track.codec_private[16:20]
597        elif track.codec.startswith('V_REAL/'):
598            track.codec = track.codec[7:]
599        elif track.codec.startswith('V_'):
600            # FIXME: add more video codecs here
601            track.codec = track.codec[2:]
602
603        track.id = len(self.video)
604        self.video.append(track)
605        return track
606
607
608    def process_audio_track(self, elements):
609        track = core.AudioStream()
610        track.codec = 'Unknown'
611        self.set_track_defaults(track)
612
613        for elem in elements:
614            elem_id = elem.get_id()
615            if elem_id == MATROSKA_CODEC_ID:
616                track.codec = elem.get_str()
617            elif elem_id == MATROSKA_AUDIO_SETTINGS_ID:
618                for settings_elem in self.process_one_level(elem):
619                    settings_elem_id = settings_elem.get_id()
620                    if settings_elem_id == MATROSKA_AUDIO_SAMPLERATE_ID:
621                        track.samplerate = settings_elem.get_float_value()
622                    elif settings_elem_id == MATROSKA_AUDIO_CHANNELS_ID:
623                        track.channels = settings_elem.get_value()
624            else:
625                self.process_track_common(elem, track)
626
627
628        if track.codec in FOURCCMap:
629            track.codec = FOURCCMap[track.codec]
630        elif '/' in track.codec and track.codec.split('/')[0] + '/' in FOURCCMap:
631            track.codec = FOURCCMap[track.codec.split('/')[0] + '/']
632        elif track.codec.startswith('A_'):
633            track.codec = track.codec[2:]
634
635        track.id = len(self.audio)
636        self.audio.append(track)
637        return track
638
639
640    def process_chapters(self, chapters):
641        elements = self.process_one_level(chapters)
642        for elem in elements:
643            if elem.get_id() == MATROSKA_EDITION_ENTRY_ID:
644                buf = elem.get_data()
645                index = 0
646                while index < elem.get_len():
647                    sub_elem = EbmlEntity(buf[index:])
648                    if sub_elem.get_id() == MATROSKA_CHAPTER_ATOM_ID:
649                        self.process_chapter_atom(sub_elem)
650                    index += sub_elem.get_total_len() + sub_elem.get_crc_len()
651
652
653    def process_chapter_atom(self, atom):
654        elements = self.process_one_level(atom)
655        chap = core.Chapter()
656
657        for elem in elements:
658            elem_id = elem.get_id()
659            if elem_id == MATROSKA_CHAPTER_TIME_START_ID:
660                # Scale timecode to seconds (float)
661                chap.pos = elem.get_value() / 1000000 / 1000.0
662            elif elem_id == MATROSKA_CHAPTER_FLAG_ENABLED_ID:
663                chap.enabled = elem.get_value()
664            elif elem_id == MATROSKA_CHAPTER_DISPLAY_ID:
665                # Matroska supports multiple (chapter name, language) pairs for
666                # each chapter, so chapter names can be internationalized.  This
667                # logic will only take the last one in the list.
668                for display_elem in self.process_one_level(elem):
669                    if display_elem.get_id() == MATROSKA_CHAPTER_STRING_ID:
670                        chap.name = display_elem.get_utf8()
671            elif elem_id == MATROSKA_CHAPTER_UID_ID:
672                self.objects_by_uid[elem.get_value()] = chap
673
674        log.debug('Chapter %r found', chap.name)
675        chap.id = len(self.chapters)
676        self.chapters.append(chap)
677
678
679    def process_attachments(self, attachments):
680        buf = attachments.get_data()
681        index = 0
682        while index < attachments.get_len():
683            elem = EbmlEntity(buf[index:])
684            if elem.get_id() == MATROSKA_ATTACHED_FILE_ID:
685                self.process_attachment(elem)
686            index += elem.get_total_len() + elem.get_crc_len()
687
688
689    def process_attachment(self, attachment):
690        elements = self.process_one_level(attachment)
691        name = desc = mimetype = ""
692        data = None
693
694        for elem in elements:
695            elem_id = elem.get_id()
696            if elem_id == MATROSKA_FILE_NAME_ID:
697                name = elem.get_utf8()
698            elif elem_id == MATROSKA_FILE_DESC_ID:
699                desc = elem.get_utf8()
700            elif elem_id == MATROSKA_FILE_MIME_TYPE_ID:
701                mimetype = elem.get_data()
702            elif elem_id == MATROSKA_FILE_DATA_ID:
703                data = elem.get_data()
704
705        # Right now we only support attachments that could be cover images.
706        # Make a guess to see if this attachment is a cover image.
707        if mimetype.startswith("image/") and "cover" in (name + desc).lower() and data:
708            self.thumbnail = data
709
710        log.debug('Attachment %r found' % name)
711
712
713    def process_tags(self, tags):
714        # Tags spec: http://www.matroska.org/technical/specs/tagging/index.html
715        # Iterate over Tags children.  Tags element children is a
716        # Tag element (whose children are SimpleTags) and a Targets element
717        # whose children specific what objects the tags apply to.
718        for tag_elem in self.process_one_level(tags):
719            # Start a new dict to hold all SimpleTag elements.
720            tags_dict = core.Tags()
721            # A list of target uids this tags dict applies too.  If empty,
722            # tags are global.
723            targets = []
724            for sub_elem in self.process_one_level(tag_elem):
725                if sub_elem.get_id() == MATROSKA_SIMPLE_TAG_ID:
726                    self.process_simple_tag(sub_elem, tags_dict)
727                elif sub_elem.get_id() == MATROSKA_TARGETS_ID:
728                    # Targets element: if there is no uid child (track uid,
729                    # chapter uid, etc.) then the tags dict applies to the
730                    # whole file (top-level Media object).
731                    for target_elem in self.process_one_level(sub_elem):
732                        target_elem_id = target_elem.get_id()
733                        if target_elem_id in (MATRSOKA_TAGS_TRACK_UID_ID, MATRSOKA_TAGS_EDITION_UID_ID,
734                                              MATRSOKA_TAGS_CHAPTER_UID_ID, MATRSOKA_TAGS_ATTACHMENT_UID_ID):
735                            targets.append(target_elem.get_value())
736                        elif target_elem_id == MATROSKA_TARGET_TYPE_VALUE_ID:
737                            # Target types not supported for now.  (Unclear how this
738                            # would fit with kaa.metadata.)
739                            pass
740            if targets:
741                # Assign tags to all listed uids
742                for target in targets:
743                    try:
744                        self.objects_by_uid[target].tags.update(tags_dict)
745                        self.tags_to_attributes(self.objects_by_uid[target], tags_dict)
746                    except KeyError:
747                        log.warning('Tags assigned to unknown/unsupported target uid %d', target)
748            else:
749                self.tags.update(tags_dict)
750                self.tags_to_attributes(self, tags_dict)
751
752
753    def process_simple_tag(self, simple_tag_elem, tags_dict):
754        """
755        Returns a dict representing the Tag element.
756        """
757        name = lang = value = children = None
758        binary = False
759        for elem in self.process_one_level(simple_tag_elem):
760            elem_id = elem.get_id()
761            if elem_id == MATROSKA_TAG_NAME_ID:
762                name = elem.get_utf8().lower()
763            elif elem_id == MATROSKA_TAG_STRING_ID:
764                value = elem.get_utf8()
765            elif elem_id == MATROSKA_TAG_BINARY_ID:
766                value = elem.get_data()
767                binary = True
768            elif elem_id == MATROSKA_TAG_LANGUAGE_ID:
769                lang = elem.get_utf8()
770            elif elem_id == MATROSKA_SIMPLE_TAG_ID:
771                if children is None:
772                    children = core.Tags()
773                self.process_simple_tag(elem, children)
774
775        if children:
776            # Convert ourselves to a Tags object.
777            children.value = value
778            children.langcode = lang
779            value = children
780        else:
781            if name.startswith('date_'):
782                # Try to convert date to a datetime object.
783                value = matroska_date_to_datetime(value)
784            value = core.Tag(value, lang, binary)
785
786        if name in tags_dict:
787            # Multiple items of this tag name.
788            if not isinstance(tags_dict[name], list):
789                # Convert to a list
790                tags_dict[name] = [tags_dict[name]]
791            # Append to list
792            tags_dict[name].append(value)
793        else:
794            tags_dict[name] = value
795
796
797    def tags_to_attributes(self, obj, tags):
798        # Convert tags to core attributes.
799        for name, tag in tags.items():
800            if isinstance(tag, dict):
801                # Nested tags dict, recurse.
802                self.tags_to_attributes(obj, tag)
803                continue
804            elif name not in TAGS_MAP:
805                continue
806
807            attr, filter = TAGS_MAP[name]
808            if attr not in obj._keys and attr not in self._keys:
809                # Tag is not in any core attribute for this object or global,
810                # so skip.
811                continue
812
813            # Pull value out of Tag object or list of Tag objects.
814            value = [item.value for item in tag] if isinstance(tag, list) else tag.value
815            if filter:
816                try:
817                    value = [filter(item) for item in value] if isinstance(value, list) else filter(value)
818                except Exception as e:
819                    log.warning('Failed to convert tag to core attribute: %r', e)
820            # Special handling for tv series recordings. The 'title' tag
821            # can be used for both the series and the episode name. The
822            # same is true for trackno which may refer to the season
823            # and the episode number. Therefore, if we find these
824            # attributes already set we try some guessing.
825            if attr == 'trackno' and getattr(self, attr) is not None:
826                # delete trackno and save season and episode
827                self.season = self.trackno
828                self.episode = value
829                self.trackno = None
830                continue
831            if attr == 'title' and getattr(self, attr) is not None:
832                # store current value of title as series and use current
833                # value of title as title
834                self.series = self.title
835            if attr in obj._keys:
836                setattr(obj, attr, value)
837            else:
838                setattr(self, attr, value)
839
840
# ``Parser`` is the name exported through __all__; it is an alias of the
# Matroska class.
Parser = Matroska
842