1# -*- coding: utf-8 -*- 2# enzyme - Video metadata parser 3# Copyright 2011-2012 Antoine Bertin <diaoulael@gmail.com> 4# Copyright 2003-2006 Thomas Schueppel <stain@acm.org> 5# Copyright 2003-2006 Dirk Meyer <dischi@freevo.org> 6# Copyright 2003-2006 Jason Tackaberry <tack@urandom.ca> 7# 8# This file is part of enzyme. 9# 10# enzyme is free software; you can redistribute it and/or modify it under 11# the terms of the GNU General Public License as published by 12# the Free Software Foundation; either version 3 of the License, or 13# (at your option) any later version. 14# 15# enzyme is distributed in the hope that it will be useful, 16# but WITHOUT ANY WARRANTY; without even the implied warranty of 17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18# GNU General Public License for more details. 19# 20# You should have received a copy of the GNU General Public License 21# along with enzyme. If not, see <http://www.gnu.org/licenses/>. 22from __future__ import absolute_import 23from datetime import datetime 24from .exceptions import ParseError 25from struct import unpack 26from . 
import core 27import logging 28import re 29 30__all__ = ['Parser'] 31 32 33# get logging object 34log = logging.getLogger(__name__) 35 36# Main IDs for the Matroska streams 37MATROSKA_VIDEO_TRACK = 0x01 38MATROSKA_AUDIO_TRACK = 0x02 39MATROSKA_SUBTITLES_TRACK = 0x11 40 41MATROSKA_HEADER_ID = 0x1A45DFA3 42MATROSKA_TRACKS_ID = 0x1654AE6B 43MATROSKA_CUES_ID = 0x1C53BB6B 44MATROSKA_SEGMENT_ID = 0x18538067 45MATROSKA_SEGMENT_INFO_ID = 0x1549A966 46MATROSKA_CLUSTER_ID = 0x1F43B675 47MATROSKA_VOID_ID = 0xEC 48MATROSKA_CRC_ID = 0xBF 49MATROSKA_TIMECODESCALE_ID = 0x2AD7B1 50MATROSKA_DURATION_ID = 0x4489 51MATROSKA_CRC32_ID = 0xBF 52MATROSKA_TIMECODESCALE_ID = 0x2AD7B1 53MATROSKA_MUXING_APP_ID = 0x4D80 54MATROSKA_WRITING_APP_ID = 0x5741 55MATROSKA_CODEC_ID = 0x86 56MATROSKA_CODEC_PRIVATE_ID = 0x63A2 57MATROSKA_FRAME_DURATION_ID = 0x23E383 58MATROSKA_VIDEO_SETTINGS_ID = 0xE0 59MATROSKA_VIDEO_WIDTH_ID = 0xB0 60MATROSKA_VIDEO_HEIGHT_ID = 0xBA 61MATROSKA_VIDEO_INTERLACED_ID = 0x9A 62MATROSKA_VIDEO_DISPLAY_WIDTH_ID = 0x54B0 63MATROSKA_VIDEO_DISPLAY_HEIGHT_ID = 0x54BA 64MATROSKA_AUDIO_SETTINGS_ID = 0xE1 65MATROSKA_AUDIO_SAMPLERATE_ID = 0xB5 66MATROSKA_AUDIO_CHANNELS_ID = 0x9F 67MATROSKA_TRACK_UID_ID = 0x73C5 68MATROSKA_TRACK_NUMBER_ID = 0xD7 69MATROSKA_TRACK_TYPE_ID = 0x83 70MATROSKA_TRACK_LANGUAGE_ID = 0x22B59C 71MATROSKA_TRACK_OFFSET = 0x537F 72MATROSKA_TRACK_FLAG_DEFAULT_ID = 0x88 73MATROSKA_TRACK_FLAG_ENABLED_ID = 0xB9 74MATROSKA_TITLE_ID = 0x7BA9 75MATROSKA_DATE_UTC_ID = 0x4461 76MATROSKA_NAME_ID = 0x536E 77 78MATROSKA_CHAPTERS_ID = 0x1043A770 79MATROSKA_CHAPTER_UID_ID = 0x73C4 80MATROSKA_EDITION_ENTRY_ID = 0x45B9 81MATROSKA_CHAPTER_ATOM_ID = 0xB6 82MATROSKA_CHAPTER_TIME_START_ID = 0x91 83MATROSKA_CHAPTER_TIME_END_ID = 0x92 84MATROSKA_CHAPTER_FLAG_ENABLED_ID = 0x4598 85MATROSKA_CHAPTER_DISPLAY_ID = 0x80 86MATROSKA_CHAPTER_LANGUAGE_ID = 0x437C 87MATROSKA_CHAPTER_STRING_ID = 0x85 88 89MATROSKA_ATTACHMENTS_ID = 0x1941A469 90MATROSKA_ATTACHED_FILE_ID = 0x61A7 
MATROSKA_FILE_DESC_ID = 0x467E
MATROSKA_FILE_NAME_ID = 0x466E
MATROSKA_FILE_MIME_TYPE_ID = 0x4660
MATROSKA_FILE_DATA_ID = 0x465C

MATROSKA_SEEKHEAD_ID = 0x114D9B74
MATROSKA_SEEK_ID = 0x4DBB
MATROSKA_SEEKID_ID = 0x53AB
MATROSKA_SEEK_POSITION_ID = 0x53AC

MATROSKA_TAGS_ID = 0x1254C367
MATROSKA_TAG_ID = 0x7373
MATROSKA_TARGETS_ID = 0x63C0
MATROSKA_TARGET_TYPE_VALUE_ID = 0x68CA
MATROSKA_TARGET_TYPE_ID = 0x63CA
# NOTE: the "MATRSOKA" spelling below is kept as-is because these are
# module-level names that other code may already reference.
MATRSOKA_TAGS_TRACK_UID_ID = 0x63C5
MATRSOKA_TAGS_EDITION_UID_ID = 0x63C9
MATRSOKA_TAGS_CHAPTER_UID_ID = 0x63C4
MATRSOKA_TAGS_ATTACHMENT_UID_ID = 0x63C6
MATROSKA_SIMPLE_TAG_ID = 0x67C8
MATROSKA_TAG_NAME_ID = 0x45A3
MATROSKA_TAG_LANGUAGE_ID = 0x447A
MATROSKA_TAG_STRING_ID = 0x4487
MATROSKA_TAG_BINARY_ID = 0x4485


# See mkv spec for details:
# http://www.matroska.org/technical/specs/index.html

# Map to convert Matroska codec IDs to well known codes
# http://haali.cs.msu.ru/mkv/codecs.pdf
FOURCCMap = {
    'V_THEORA': 'THEO',
    'V_SNOW': 'SNOW',
    'V_MPEG4/ISO/ASP': 'MP4V',
    'V_MPEG4/ISO/AVC': 'AVC1',
    'A_AC3': 0x2000,
    'A_MPEG/L3': 0x0055,
    'A_MPEG/L2': 0x0050,
    'A_MPEG/L1': 0x0050,
    'A_DTS': 0x2001,
    'A_PCM/INT/LIT': 0x0001,
    'A_PCM/FLOAT/IEEE': 0x003,
    'A_TTA1': 0x77a1,
    'A_WAVPACK4': 0x5756,
    'A_VORBIS': 0x6750,
    'A_FLAC': 0xF1AC,
    'A_AAC': 0x00ff,
    'A_AAC/': 0x00ff
}


def matroska_date_to_datetime(date):
    """
    Converts a date in Matroska's date format to a python datetime object.
    Returns the given date unchanged if it could not be converted (this
    includes values that are not strings at all, such as None).
    """
    # From the specs:
    #   The fields with dates should have the following format: YYYY-MM-DD
    #   HH:MM:SS.MSS [...] To store less accuracy, you remove items starting
    #   from the right. To store only the year, you would use, "2004". To store
    #   a specific day such as May 1st, 2003, you would use "2003-05-01".
    #
    # The split keeps the separators, so the list alternates between a
    # directive and the separator that follows it. Dropping the last two
    # items therefore removes one directive together with its separator.
    pieces = re.split(r'([-:. ])', '%Y-%m-%d %H:%M:%S.%f')
    while pieces:
        try:
            return datetime.strptime(date, ''.join(pieces))
        except ValueError:
            pieces = pieces[:-2]
        except TypeError:
            # Not a string: no format can ever match, so stop trying.
            break
    return date


def matroska_bps_to_bitrate(bps):
    """
    Tries to convert a free-form bps string into a bitrate (bits per second).

    Returns a float, or None if the string is not recognised. A bare number
    below 30000 is assumed to be in kilobits. float() errors on malformed
    numbers (e.g. '1.2.3') are not caught here.
    """
    m = re.search(r'([\d.]+)\s*(\D.*)', bps)
    if m:
        bps, suffix = m.groups()
        # Order matters: 'kbit'/'kbyte' must be tested before the shorter
        # 'byte'/'bit' substrings they contain.
        if 'kbit' in suffix:
            return float(bps) * 1024
        elif 'kbyte' in suffix:
            return float(bps) * 1024 * 8
        elif 'byte' in suffix:
            return float(bps) * 8
        elif 'bps' in suffix or 'bit' in suffix:
            return float(bps)
    if bps.replace('.', '').isdigit():
        if float(bps) < 30000:
            # Assume kilobits and convert to bps
            return float(bps) * 1024
        return float(bps)


# Used to convert the official matroska tag names (only lower-cased) to core
# attributes. tag name -> attr, filter
TAGS_MAP = {
    # From Media core
    'title': ('title', None),
    'subtitle': ('caption', None),
    'comment': ('comment', None),
    'url': ('url', None),
    'artist': ('artist', None),
    'keywords': ('keywords', lambda s: [word.strip() for word in s.split(',')]),
    'composer_nationality': ('country', None),
    'date_released': ('datetime', None),
    'date_recorded': ('datetime', None),
    'date_written': ('datetime', None),

    # From Video core
    'encoder': ('encoder', None),
    'bps': ('bitrate', matroska_bps_to_bitrate),
    'part_number': ('trackno', int),
    'total_parts': ('trackof', int),
    'copyright': ('copyright', None),
    'genre': ('genre', None),
    'actor': ('actors', None),
    'written_by': ('writer', None),
    'producer': ('producer', None),
    'production_studio': ('studio', None),
    'law_rating': ('rating', None),
    'summary': ('summary', None),
    'synopsis': ('synopsis', None),
}


class EbmlEntity:
    """
    This is the class that is responsible for handling one Ebml entity as
    described in the Matroska/Ebml spec.

    The entity is decoded from the start of the given buffer. Leading CRC-32
    elements are skipped; their accumulated size is available through
    get_crc_len().
    """
    def __init__(self, inbuf):
        """
        Parse the first non-CRC entity found at the start of inbuf.

        Raises ParseError if no valid entity can be decoded, including when
        the buffer is truncated in the middle of an entity header.
        """
        # Set the CRC len to zero
        self.crc_len = 0
        try:
            self.build_entity(inbuf)
            # Now loop until we find an entity without CRC
            while self.get_id() == MATROSKA_CRC32_ID:
                self.crc_len += self.get_total_len()
                inbuf = inbuf[self.get_total_len():]
                self.build_entity(inbuf)
        except IndexError:
            # Header bytes ran past the end of the buffer.
            raise ParseError()

    def build_entity(self, inbuf):
        """
        Decode id, size and payload of the entity at the start of inbuf.

        Raises ParseError if no element id can be read. May raise IndexError
        if the size field is truncated.
        """
        self.compute_id(inbuf)

        if self.id_len == 0:
            log.error('EBML entity not found, bad file format')
            raise ParseError()

        self.entity_len, self.len_size = self.compute_len(inbuf[self.id_len:])
        self.entity_data = inbuf[self.get_header_len():self.get_total_len()]
        # Keep the declared size; entity_len is clamped to what is actually
        # available in the buffer (more can be appended with add_data()).
        self.ebml_length = self.entity_len
        self.entity_len = min(len(self.entity_data), self.entity_len)

        # if the data size is 8 or less, it could be a numeric value
        # (decoded as a big-endian unsigned integer)
        self.value = 0
        if self.entity_len <= 8:
            for char in self.entity_data[:self.entity_len]:
                self.value = (self.value << 8) | ord(char)

    def add_data(self, data):
        """
        Append payload bytes, never exceeding the declared element size.
        """
        maxlen = self.ebml_length - len(self.entity_data)
        if maxlen <= 0:
            return
        self.entity_data += data[:maxlen]
        self.entity_len = len(self.entity_data)

    def compute_id(self, inbuf):
        """
        Decode the element id at the start of inbuf.

        Sets id_len (0 if no id could be read), and when an id was found
        also entity_id and entity_str (the raw id bytes). The position of
        the first set bit among the top four bits of the first byte gives
        the id width (1 to 4 bytes). Returns 0 when the buffer is too short.
        """
        self.id_len = 0
        if len(inbuf) < 1:
            return 0
        first = ord(inbuf[0])
        for width, marker in ((1, 0x80), (2, 0x40), (3, 0x20), (4, 0x10)):
            if first & marker:
                if len(inbuf) < width:
                    return 0
                entity_id = 0
                for char in inbuf[:width]:
                    entity_id = (entity_id << 8) | ord(char)
                self.id_len = width
                self.entity_id = entity_id
                break
        self.entity_str = inbuf[0:self.id_len]

    def compute_len(self, inbuf):
        """
        Decode the variable-width size field at the start of inbuf.

        Returns a (size, width) tuple where width is the number of bytes the
        size field occupies. Returns (0, 0) for an empty buffer or a first
        byte without any marker bit. A field whose value bits are all ones
        is reported as size 0. Raises IndexError if the field is truncated.
        """
        if not inbuf:
            return 0, 0
        extra = num_ffs = 0
        len_mask = 0x80
        length = ord(inbuf[0])
        # The number of leading zero bits is the number of extra bytes.
        while not length & len_mask:
            extra += 1
            len_mask >>= 1
            if extra >= 8:
                return 0, 0

        # Strip the marker bit, then count bytes that are all ones.
        length &= len_mask - 1
        if length == len_mask - 1:
            num_ffs += 1
        for p in range(extra):
            length = (length << 8) | ord(inbuf[p + 1])
            if length & 0xff == 0xff:
                num_ffs += 1
        if num_ffs == extra + 1:
            length = 0
        return length, extra + 1

    def get_crc_len(self):
        """Return the total size of the CRC elements skipped before this one."""
        return self.crc_len

    def get_value(self):
        """Return the payload as an unsigned integer (0 if longer than 8 bytes)."""
        return self.value

    def get_float_value(self):
        """Return the payload as a big-endian float/double, or 0.0 otherwise."""
        if len(self.entity_data) == 4:
            return unpack('!f', self.entity_data)[0]
        elif len(self.entity_data) == 8:
            return unpack('!d', self.entity_data)[0]
        return 0.0

    def get_data(self):
        """Return the raw payload."""
        return self.entity_data

    def get_utf8(self):
        """Return the payload decoded as UTF-8 (invalid bytes are replaced)."""
        return unicode(self.entity_data, 'utf-8', 'replace')

    def get_str(self):
        """Return the payload decoded as ASCII (invalid bytes are replaced)."""
        return unicode(self.entity_data, 'ascii', 'replace')

    def get_id(self):
        """Return the numeric element id."""
        return self.entity_id

    def get_str_id(self):
        """Return the raw bytes of the element id."""
        return self.entity_str

    def get_len(self):
        """Return the number of payload bytes currently held."""
        return self.entity_len

    def get_total_len(self):
        """Return header size plus the number of payload bytes currently held."""
        return self.entity_len + self.id_len + self.len_size

    def get_header_len(self):
        """Return the size of the id and size fields together."""
        return self.id_len + self.len_size


class Matroska(core.AVContainer):
    """
    Matroska video and audio parser. If at least one video stream is
    detected it will set the type to MEDIA_AV.
    """
    def __init__(self, file):
        """
        Parse the metadata of the Matroska file object `file`.

        Raises ParseError if the file is empty or does not start with an
        EBML header. Top-level elements are located through the seek heads
        found in the first 2000 bytes.
        """
        core.AVContainer.__init__(self)
        self.samplerate = 1

        self.file = file
        # Read enough that we're likely to get the full seekhead (FIXME: kludge)
        buffer = file.read(2000)
        if not buffer:
            # Regular File end
            raise ParseError()

        # Check the Matroska header
        header = EbmlEntity(buffer)
        if header.get_id() != MATROSKA_HEADER_ID:
            raise ParseError()

        log.debug('HEADER ID found %08X' % header.get_id())
        self.mime = 'video/x-matroska'
        self.type = 'Matroska'
        self.has_idx = False
        self.objects_by_uid = {}

        # Now get the segment
        self.segment = segment = EbmlEntity(buffer[header.get_total_len():])
        # Record file offset of segment data for seekheads
        self.segment.offset = header.get_total_len() + segment.get_header_len()
        if segment.get_id() != MATROSKA_SEGMENT_ID:
            log.debug('SEGMENT ID not found %08X' % segment.get_id())
            return

        log.debug('SEGMENT ID found %08X' % segment.get_id())
        try:
            for elem in self.process_one_level(segment):
                if elem.get_id() == MATROSKA_SEEKHEAD_ID:
                    self.process_elem(elem)
        except ParseError:
            # Best effort: keep whatever was collected before the bad element.
            log.debug('Parse error while processing segment, stopping early')

        if not self.has_idx:
            log.warning('File has no index')
            self._set('corrupt', True)

    def process_elem(self, elem):
        """
        Dispatch one top-level element to its handler. Always returns True.
        """
        elem_id = elem.get_id()
        log.debug('BEGIN: process element %r' % hex(elem_id))
        if elem_id == MATROSKA_SEGMENT_INFO_ID:
            duration = 0
            scalecode = 1000000.0

            for ielem in self.process_one_level(elem):
                ielem_id = ielem.get_id()
                if ielem_id == MATROSKA_TIMECODESCALE_ID:
                    scalecode = ielem.get_value()
                elif ielem_id == MATROSKA_DURATION_ID:
                    duration = ielem.get_float_value()
                elif ielem_id == MATROSKA_TITLE_ID:
                    self.title = ielem.get_utf8()
                elif ielem_id == MATROSKA_DATE_UTC_ID:
                    date_data = ielem.get_data()
                    # '!q' needs exactly 8 bytes; skip malformed dates
                    # instead of aborting the whole parse.
                    if len(date_data) == 8:
                        timestamp = unpack('!q', date_data)[0] / 10.0 ** 9
                        # Date is offset 2001-01-01 00:00:00 (timestamp 978307200.0)
                        self.timestamp = int(timestamp + 978307200)
                    else:
                        log.debug('Ignoring date element of unexpected size %d', len(date_data))

            self.length = duration * scalecode / 1000000000.0

        elif elem_id == MATROSKA_TRACKS_ID:
            self.process_tracks(elem)

        elif elem_id == MATROSKA_CHAPTERS_ID:
            self.process_chapters(elem)

        elif elem_id == MATROSKA_ATTACHMENTS_ID:
            self.process_attachments(elem)

        elif elem_id == MATROSKA_SEEKHEAD_ID:
            self.process_seekhead(elem)

        elif elem_id == MATROSKA_TAGS_ID:
            self.process_tags(elem)

        elif elem_id == MATROSKA_CUES_ID:
            self.has_idx = True

        log.debug('END: process element %r' % hex(elem_id))
        return True

    def process_seekhead(self, elem):
        """
        Follow every Seek entry of a seek head and process its target element.
        """
        for seek_elem in self.process_one_level(elem):
            if seek_elem.get_id() != MATROSKA_SEEK_ID:
                continue
            for sub_elem in self.process_one_level(seek_elem):
                if sub_elem.get_id() == MATROSKA_SEEKID_ID:
                    if sub_elem.get_value() == MATROSKA_CLUSTER_ID:
                        # Not interested in these.
                        # NOTE(review): this stops processing the whole seek
                        # head, so Seek entries listed after a Cluster entry
                        # are skipped too -- confirm this is intended.
                        return

                elif sub_elem.get_id() == MATROSKA_SEEK_POSITION_ID:
                    # Seek positions are relative to the segment data start.
                    self.file.seek(self.segment.offset + sub_elem.get_value())
                    buffer = self.file.read(100)
                    try:
                        target = EbmlEntity(buffer)
                    except ParseError:
                        continue

                    # Fetch all data necessary for this element.
                    target.add_data(self.file.read(target.ebml_length))
                    self.process_elem(target)

    def process_tracks(self, tracks):
        """
        Process every child of a Tracks element as a track entry.
        """
        for trackelem in self.process_one_level(tracks):
            log.debug ('ELEMENT %X found' % trackelem.get_id())
            self.process_track(trackelem)

    def process_one_level(self, item):
        """
        Generator yielding the direct children of `item` as EbmlEntity
        objects. Raises ParseError (from EbmlEntity) on malformed data.
        """
        buf = item.get_data()
        index = 0
        while index < item.get_len():
            if len(buf[index:]) == 0:
                break
            elem = EbmlEntity(buf[index:])
            yield elem
            index += elem.get_total_len() + elem.get_crc_len()

    def set_track_defaults(self, track):
        """Apply default values to a new track (language 'eng')."""
        track.language = 'eng'

    def process_track(self, track):
        """
        Create a video, audio or subtitle stream from one track entry.
        Tracks without a type or with an unknown type are ignored.
        """
        # Collapse generator into a list since we need to iterate over it
        # twice.
        elements = list(self.process_one_level(track))
        track_type = [x.get_value() for x in elements if x.get_id() == MATROSKA_TRACK_TYPE_ID]
        if not track_type:
            log.debug('Bad track: no type id found')
            return

        track_type = track_type[0]
        track = None

        if track_type == MATROSKA_VIDEO_TRACK:
            log.debug('Video track found')
            track = self.process_video_track(elements)
        elif track_type == MATROSKA_AUDIO_TRACK:
            log.debug('Audio track found')
            track = self.process_audio_track(elements)
        elif track_type == MATROSKA_SUBTITLES_TRACK:
            log.debug('Subtitle track found')
            track = core.Subtitle()
            self.set_track_defaults(track)
            track.id = len(self.subtitles)
            self.subtitles.append(track)
            for elem in elements:
                self.process_track_common(elem, track)

    def process_track_common(self, elem, track):
        """
        Apply a track attribute that is shared by all track types.
        Track UIDs are recorded in objects_by_uid so tags can target them.
        """
        elem_id = elem.get_id()
        if elem_id == MATROSKA_TRACK_LANGUAGE_ID:
            track.language = elem.get_str()
            log.debug('Track language found: %r' % track.language)
        elif elem_id == MATROSKA_NAME_ID:
            track.title = elem.get_utf8()
        elif elem_id == MATROSKA_TRACK_NUMBER_ID:
            track.trackno = elem.get_value()
        elif elem_id == MATROSKA_TRACK_FLAG_ENABLED_ID:
            track.enabled = bool(elem.get_value())
        elif elem_id == MATROSKA_TRACK_FLAG_DEFAULT_ID:
            track.default = bool(elem.get_value())
        elif elem_id == MATROSKA_CODEC_ID:
            track.codec = elem.get_str()
        elif elem_id == MATROSKA_CODEC_PRIVATE_ID:
            track.codec_private = elem.get_data()
        elif elem_id == MATROSKA_TRACK_UID_ID:
            self.objects_by_uid[elem.get_value()] = track

    def process_video_track(self, elements):
        """
        Build a core.VideoStream from the children of a video track entry,
        append it to self.video and return it.
        """
        track = core.VideoStream()
        # Defaults
        track.codec = 'Unknown'
        track.fps = 0
        self.set_track_defaults(track)

        for elem in elements:
            elem_id = elem.get_id()
            if elem_id == MATROSKA_CODEC_ID:
                track.codec = elem.get_str()

            elif elem_id == MATROSKA_FRAME_DURATION_ID:
                # The frame duration is scaled by 1e-9 to get seconds.
                try:
                    track.fps = 1 / (pow(10, -9) * (elem.get_value()))
                except ZeroDivisionError:
                    pass

            elif elem_id == MATROSKA_VIDEO_SETTINGS_ID:
                d_width = d_height = None
                for settings_elem in self.process_one_level(elem):
                    settings_elem_id = settings_elem.get_id()
                    if settings_elem_id == MATROSKA_VIDEO_WIDTH_ID:
                        track.width = settings_elem.get_value()
                    elif settings_elem_id == MATROSKA_VIDEO_HEIGHT_ID:
                        track.height = settings_elem.get_value()
                    elif settings_elem_id == MATROSKA_VIDEO_DISPLAY_WIDTH_ID:
                        d_width = settings_elem.get_value()
                    elif settings_elem_id == MATROSKA_VIDEO_DISPLAY_HEIGHT_ID:
                        d_height = settings_elem.get_value()
                    elif settings_elem_id == MATROSKA_VIDEO_INTERLACED_ID:
                        value = int(settings_elem.get_value())
                        self._set('interlaced', value)

                # A display height of 0 cannot yield an aspect ratio; skip it
                # rather than failing with ZeroDivisionError.
                if d_width is not None and d_height:
                    track.aspect = float(d_width) / d_height

            else:
                self.process_track_common(elem, track)

        # convert codec information
        # http://haali.cs.msu.ru/mkv/codecs.pdf
        if track.codec in FOURCCMap:
            track.codec = FOURCCMap[track.codec]
        elif '/' in track.codec and track.codec.split('/')[0] + '/' in FOURCCMap:
            track.codec = FOURCCMap[track.codec.split('/')[0] + '/']
        elif track.codec.endswith('FOURCC') and len(track.codec_private or '') == 40:
            # Bytes 16-20 of the 40 byte private data are used as the fourcc.
            track.codec = track.codec_private[16:20]
        elif track.codec.startswith('V_REAL/'):
            track.codec = track.codec[7:]
        elif track.codec.startswith('V_'):
            # FIXME: add more video codecs here
            track.codec = track.codec[2:]

        track.id = len(self.video)
        self.video.append(track)
        return track

    def process_audio_track(self, elements):
        """
        Build a core.AudioStream from the children of an audio track entry,
        append it to self.audio and return it.
        """
        track = core.AudioStream()
        track.codec = 'Unknown'
        self.set_track_defaults(track)

        for elem in elements:
            elem_id = elem.get_id()
            if elem_id == MATROSKA_CODEC_ID:
                track.codec = elem.get_str()
            elif elem_id == MATROSKA_AUDIO_SETTINGS_ID:
                for settings_elem in self.process_one_level(elem):
                    settings_elem_id = settings_elem.get_id()
                    if settings_elem_id == MATROSKA_AUDIO_SAMPLERATE_ID:
                        track.samplerate = settings_elem.get_float_value()
                    elif settings_elem_id == MATROSKA_AUDIO_CHANNELS_ID:
                        track.channels = settings_elem.get_value()
            else:
                self.process_track_common(elem, track)

        if track.codec in FOURCCMap:
            track.codec = FOURCCMap[track.codec]
        elif '/' in track.codec and track.codec.split('/')[0] + '/' in FOURCCMap:
            track.codec = FOURCCMap[track.codec.split('/')[0] + '/']
        elif track.codec.startswith('A_'):
            track.codec = track.codec[2:]

        track.id = len(self.audio)
        self.audio.append(track)
        return track

    def process_chapters(self, chapters):
        """
        Process every chapter atom found in the edition entries of a
        Chapters element.
        """
        for elem in self.process_one_level(chapters):
            if elem.get_id() != MATROSKA_EDITION_ENTRY_ID:
                continue
            for sub_elem in self.process_one_level(elem):
                if sub_elem.get_id() == MATROSKA_CHAPTER_ATOM_ID:
                    self.process_chapter_atom(sub_elem)

    def process_chapter_atom(self, atom):
        """
        Build a core.Chapter from a chapter atom and append it to
        self.chapters. Chapter UIDs are recorded in objects_by_uid.
        """
        elements = self.process_one_level(atom)
        chap = core.Chapter()

        for elem in elements:
            elem_id = elem.get_id()
            if elem_id == MATROSKA_CHAPTER_TIME_START_ID:
                # Scale timecode to seconds (float)
                # NOTE(review): the first division is between integers, so
                # under Python 2 the position is truncated to whole
                # milliseconds -- presumably intended, confirm.
                chap.pos = elem.get_value() / 1000000 / 1000.0
            elif elem_id == MATROSKA_CHAPTER_FLAG_ENABLED_ID:
                chap.enabled = elem.get_value()
            elif elem_id == MATROSKA_CHAPTER_DISPLAY_ID:
                # Matroska supports multiple (chapter name, language) pairs for
                # each chapter, so chapter names can be internationalized. This
                # logic will only take the last one in the list.
                for display_elem in self.process_one_level(elem):
                    if display_elem.get_id() == MATROSKA_CHAPTER_STRING_ID:
                        chap.name = display_elem.get_utf8()
            elif elem_id == MATROSKA_CHAPTER_UID_ID:
                self.objects_by_uid[elem.get_value()] = chap

        log.debug('Chapter %r found', chap.name)
        chap.id = len(self.chapters)
        self.chapters.append(chap)

    def process_attachments(self, attachments):
        """
        Process every attached file of an Attachments element.
        """
        for elem in self.process_one_level(attachments):
            if elem.get_id() == MATROSKA_ATTACHED_FILE_ID:
                self.process_attachment(elem)

    def process_attachment(self, attachment):
        """
        Inspect one attached file and use it as thumbnail if it looks like
        a cover image.
        """
        elements = self.process_one_level(attachment)
        name = desc = mimetype = ""
        data = None

        for elem in elements:
            elem_id = elem.get_id()
            if elem_id == MATROSKA_FILE_NAME_ID:
                name = elem.get_utf8()
            elif elem_id == MATROSKA_FILE_DESC_ID:
                desc = elem.get_utf8()
            elif elem_id == MATROSKA_FILE_MIME_TYPE_ID:
                mimetype = elem.get_data()
            elif elem_id == MATROSKA_FILE_DATA_ID:
                data = elem.get_data()

        # Right now we only support attachments that could be cover images.
        # Make a guess to see if this attachment is a cover image.
        if mimetype.startswith("image/") and "cover" in (name + desc).lower() and data:
            self.thumbnail = data

        log.debug('Attachment %r found' % name)

    def process_tags(self, tags):
        """
        Process a Tags element and attach the tags to their targets, or to
        the container itself when no target uid is given.
        """
        # Tags spec: http://www.matroska.org/technical/specs/tagging/index.html
        # Iterate over Tags children. Tags element children is a
        # Tag element (whose children are SimpleTags) and a Targets element
        # whose children specify what objects the tags apply to.
        for tag_elem in self.process_one_level(tags):
            # Start a new dict to hold all SimpleTag elements.
            tags_dict = core.Tags()
            # A list of target uids this tags dict applies to. If empty,
            # tags are global.
            targets = []
            for sub_elem in self.process_one_level(tag_elem):
                if sub_elem.get_id() == MATROSKA_SIMPLE_TAG_ID:
                    self.process_simple_tag(sub_elem, tags_dict)
                elif sub_elem.get_id() == MATROSKA_TARGETS_ID:
                    # Targets element: if there is no uid child (track uid,
                    # chapter uid, etc.) then the tags dict applies to the
                    # whole file (top-level Media object).
                    for target_elem in self.process_one_level(sub_elem):
                        target_elem_id = target_elem.get_id()
                        if target_elem_id in (MATRSOKA_TAGS_TRACK_UID_ID, MATRSOKA_TAGS_EDITION_UID_ID,
                                              MATRSOKA_TAGS_CHAPTER_UID_ID, MATRSOKA_TAGS_ATTACHMENT_UID_ID):
                            targets.append(target_elem.get_value())
                        elif target_elem_id == MATROSKA_TARGET_TYPE_VALUE_ID:
                            # Target types not supported for now. (Unclear how this
                            # would fit with kaa.metadata.)
                            pass
            if targets:
                # Assign tags to all listed uids
                for target in targets:
                    try:
                        self.objects_by_uid[target].tags.update(tags_dict)
                        self.tags_to_attributes(self.objects_by_uid[target], tags_dict)
                    except KeyError:
                        log.warning('Tags assigned to unknown/unsupported target uid %d', target)
            else:
                self.tags.update(tags_dict)
                self.tags_to_attributes(self, tags_dict)

    def process_simple_tag(self, simple_tag_elem, tags_dict):
        """
        Parse one SimpleTag element and store it in tags_dict under its
        lower-cased name. Nothing is returned; tags_dict is updated in place.

        Nested SimpleTags become a core.Tags object, plain ones a core.Tag.
        Repeated names are collected in a list.
        """
        name = lang = value = children = None
        binary = False
        for elem in self.process_one_level(simple_tag_elem):
            elem_id = elem.get_id()
            if elem_id == MATROSKA_TAG_NAME_ID:
                name = elem.get_utf8().lower()
            elif elem_id == MATROSKA_TAG_STRING_ID:
                value = elem.get_utf8()
            elif elem_id == MATROSKA_TAG_BINARY_ID:
                value = elem.get_data()
                binary = True
            elif elem_id == MATROSKA_TAG_LANGUAGE_ID:
                lang = elem.get_utf8()
            elif elem_id == MATROSKA_SIMPLE_TAG_ID:
                if children is None:
                    children = core.Tags()
                self.process_simple_tag(elem, children)

        if children:
            # Convert ourselves to a Tags object.
            children.value = value
            children.langcode = lang
            value = children
        else:
            # name stays None when the SimpleTag has no name element.
            if name is not None and name.startswith('date_'):
                # Try to convert date to a datetime object.
                value = matroska_date_to_datetime(value)
            value = core.Tag(value, lang, binary)

        if name in tags_dict:
            # Multiple items of this tag name.
            if not isinstance(tags_dict[name], list):
                # Convert to a list
                tags_dict[name] = [tags_dict[name]]
            # Append to list
            tags_dict[name].append(value)
        else:
            tags_dict[name] = value

    def tags_to_attributes(self, obj, tags):
        """
        Copy the tags listed in TAGS_MAP to core attributes of obj, falling
        back to the container when obj does not have the attribute.
        """
        # Convert tags to core attributes.
        for name, tag in tags.items():
            if isinstance(tag, dict):
                # Nested tags dict, recurse.
                self.tags_to_attributes(obj, tag)
                continue
            elif name not in TAGS_MAP:
                continue

            attr, convert = TAGS_MAP[name]
            if attr not in obj._keys and attr not in self._keys:
                # Tag is not in any core attribute for this object or global,
                # so skip.
                continue

            # Pull value out of Tag object or list of Tag objects.
            value = [item.value for item in tag] if isinstance(tag, list) else tag.value
            if convert:
                try:
                    value = [convert(item) for item in value] if isinstance(value, list) else convert(value)
                except Exception as e:
                    # On failure the unconverted value is used below.
                    log.warning('Failed to convert tag to core attribute: %r', e)
            # Special handling for tv series recordings. The 'title' tag
            # can be used for both the series and the episode name. The
            # same is true for trackno which may refer to the season
            # and the episode number. Therefore, if we find these
            # attributes already set we try some guessing.
            if attr == 'trackno' and getattr(self, attr) is not None:
                # delete trackno and save season and episode
                self.season = self.trackno
                self.episode = value
                self.trackno = None
                continue
            if attr == 'title' and getattr(self, attr) is not None:
                # store current value of title as series and use the new
                # value as title
                self.series = self.title
            if attr in obj._keys:
                setattr(obj, attr, value)
            else:
                setattr(self, attr, value)


Parser = Matroska