#!/usr/local/bin/python3.8
# -*- coding: iso-8859-1 -*-
# kate: replace-tabs off;
# ***************************************************************************
#    copyright            : (C) 2006-2010 by Mathias Monnerville
#    email                : tellico@monnerville.com
# ***************************************************************************
#
# ***************************************************************************
# *                                                                         *
# *   This program is free software; you can redistribute it and/or modify  *
# *   it under the terms of version 2 of the GNU General Public License as  *
# *   published by the Free Software Foundation;                            *
# *                                                                         *
# ***************************************************************************
#
# Version 0.7.3: 2010-12-07 (Reported by Romain Henriet)
# * Fixed some regexp issues
# * Better handling of image parsing/fetching errors
#
# Version 0.7.2.1: 2010-07-27 (Reported by Romain Henriet)
# * Updated title match to allow searching without diacritical marks
#
# Version 0.7.2: 2010-05-27 (Reported by Romain Henriet)
# * Fixed bug preventing searches with accent marks
# * Added post-processing cleanup action to replace raw HTML entities with
#   their ISO Latin-1 replacement text
#
# Version 0.7.1: 2010-04-26 (Thanks to Romain Henriet <romain-devel@laposte.net>)
# * Fixed greedy regexp for genre. Fixed nationality output. Add studio.
#
# Version 0.7: 2009-11-12
# * Allocine has a brand new website. All regexps were broken.
#
# Version 0.6: 2009-03-04 (Thanks to R. Fischer and Henry-Nicolas Tourneur)
# * Fixed parsing issues (various RegExp issues due to allocine's HTML changes)
#
# Version 0.5: 2009-01-21 (Changes contributed by R. Fischer <fischer.tellico@free.fr>)
# * Added complete distribution of actors and roles, Genres, Nationalities, producers, composer and scenarist
# * Fixed the plot field that returned a wrong answer when no plot is available
# * Fixed a bug related to parameters encoding
#
# Version 0.4:
# * Fixed parsing errors: some fields in allocine's HTML pages have changed recently. Multiple actors and genres
#   could not be retrieved. Fixed bad http request error due to some changes in HTML code.
#
# Version 0.3:
# * Fixed parsing: some fields in allocine's HTML pages have changed. Movie's image could not be fetched anymore. Fixed.
#
# Version 0.2:
# * Fixed parsing: allocine's HTML pages have changed. Movie's image could not be fetched anymore.
#
# Version 0.1:
# * Initial release.
"""Tellico data-source script that looks a movie title up on allocine.fr.

The search results and the movie/casting pages are scraped with regular
expressions and the result is printed to stdout as a Tellico XML document.
"""

import sys
import os
import re
import hashlib
import random
import types
import urllib
import time
import base64
import xml.dom.minidom
import locale

# ``htmlents`` is always the entity-name -> replacement-text dictionary,
# whichever module provides it.
try:
    from htmlentitydefs import entitydefs as htmlents
except ImportError:
    try:
        from html.entities import entitydefs as htmlents
    except ImportError:
        print('Python 2.5+ required')
        raise

try:
    # For Python 3.0 and later
    from urllib.request import urlopen
except ImportError:
    # Fall back to Python 2's urllib2
    from urllib2 import urlopen

try:
    # For Python 3.0 and later
    from urllib.parse import quote
except ImportError:
    # Fall back to Python 2's urllib
    from urllib import quote

XML_HEADER = """<?xml version="1.0" encoding="UTF-8"?>"""
DOCTYPE = """<!DOCTYPE tellico PUBLIC "-//Robby Stephenson/DTD Tellico V9.0//EN" "http://periapsis.org/tellico/dtd/v9/tellico.dtd">"""

VERSION = "0.7.3"


def genMD5():
    """Return the hexadecimal MD5 digest of a random number (used as an image id)."""
    seed = str(random.random())
    # hashlib only accepts bytes, so the textual seed is encoded first.
    return hashlib.md5(seed.encode('utf-8')).hexdigest()


class BasicTellicoDOM:
    """Builds the Tellico XML document (a video collection) entry by entry."""

    def __init__(self):
        self.__doc = xml.dom.minidom.Document()
        self.__root = self.__doc.createElement('tellico')
        self.__root.setAttribute('xmlns', 'http://periapsis.org/tellico/')
        self.__root.setAttribute('syntaxVersion', '9')

        self.__collection = self.__doc.createElement('collection')
        self.__collection.setAttribute('title', 'My Movies')
        self.__collection.setAttribute('type', '3')

        self.__fields = self.__doc.createElement('fields')
        # Add all default (standard) fields
        self.__dfltField = self.__doc.createElement('field')
        self.__dfltField.setAttribute('name', '_default')

        # Add a custom 'Collection' field
        self.__customField = self.__doc.createElement('field')
        self.__customField.setAttribute('name', 'titre-original')
        self.__customField.setAttribute('title', 'Original Title')
        self.__customField.setAttribute('flags', '0')
        # Written with escapes so the source stays pure ASCII whatever the
        # declared file encoding is.
        self.__customField.setAttribute('category', 'G\u00e9n\u00e9ral')
        self.__customField.setAttribute('format', '1')
        self.__customField.setAttribute('type', '1')
        self.__customField.setAttribute('i18n', 'yes')

        self.__fields.appendChild(self.__dfltField)
        self.__fields.appendChild(self.__customField)
        self.__collection.appendChild(self.__fields)

        self.__images = self.__doc.createElement('images')

        self.__root.appendChild(self.__collection)
        self.__doc.appendChild(self.__root)

        # Current movie id
        self.__currentId = 0

    def __textElement(self, tag, text):
        """Return a new element named tag holding text as its only child."""
        node = self.__doc.createElement(tag)
        node.appendChild(self.__doc.createTextNode(text))
        return node

    def __listElement(self, parentTag, childTag, values):
        """Return a parentTag element with one childTag text element per value."""
        parent = self.__doc.createElement(parentTag)
        for value in values:
            parent.appendChild(self.__textElement(childTag, value))
        return parent

    def addEntry(self, movieData):
        """
        Add a movie entry

        @param movieData dictionary with the text fields 'title', 'otitle',
               'year', 'time', 'allocine' and 'plot', the list fields
               'genres', 'studio', 'nat', 'dirs', 'prods', 'scens' and
               'comps', the flat list 'actors' (actor, role, actor, role...)
               and 'image', either None or an (id, base64 data) pair.
               Missing or empty fields produce empty elements.
        """
        d = movieData

        def text(key):
            return d.get(key) or ''

        def items(key):
            return d.get(key) or []

        entryNode = self.__doc.createElement('entry')
        entryNode.setAttribute('id', str(self.__currentId))

        # 'actors' is a flat list; pair every actor with the role following
        # it. An unpaired trailing item is ignored instead of raising.
        castsNode = self.__doc.createElement('casts')
        actors = items('actors')
        for actor, role in zip(actors[0::2], actors[1::2]):
            castNode = self.__doc.createElement('cast')
            castNode.appendChild(self.__textElement('column', actor))
            castNode.appendChild(self.__textElement('column', role))
            castsNode.appendChild(castNode)

        # The order of the children is the order they appear in the entry.
        children = (
            self.__textElement('title', text('title')),
            self.__textElement('titre-original', text('otitle')),
            self.__textElement('year', text('year')),
            self.__listElement('genres', 'genre', items('genres')),
            self.__listElement('studios', 'studio', items('studio')),
            self.__listElement('nationalitys', 'nationality', items('nat')),
            castsNode,
            self.__listElement('directors', 'director', items('dirs')),
            self.__textElement('running-time', text('time')),
            self.__textElement('allocin\u00e9-link', text('allocine')),
            self.__textElement('plot', text('plot')),
            self.__listElement('producers', 'producer', items('prods')),
            self.__listElement('composers', 'composer', items('comps')),
            self.__listElement('writers', 'writer', items('scens')),
        )
        for child in children:
            entryNode.appendChild(child)

        image = d.get('image')
        if image:
            imageNode = self.__textElement('image', image[1])
            imageNode.setAttribute('format', 'JPEG')
            imageNode.setAttribute('id', image[0])
            imageNode.setAttribute('width', '120')
            imageNode.setAttribute('height', '160')

            entryNode.appendChild(self.__textElement('cover', image[0]))
            self.__images.appendChild(imageNode)

        self.__collection.appendChild(entryNode)
        self.__currentId += 1

    def printXML(self):
        """
        Outputs XML content to stdout
        """
        self.__collection.appendChild(self.__images)
        document = "\n".join((XML_HEADER, DOCTYPE, self.__root.toxml())) + "\n"

        # The XML header announces UTF-8, so emit UTF-8 bytes whatever the
        # locale of the calling process is.
        binaryOut = getattr(sys.stdout, 'buffer', None)
        if binaryOut is None:
            sys.stdout.write(document)
        else:
            sys.stdout.flush()
            binaryOut.write(document.encode('utf-8'))
            binaryOut.flush()


class AlloCineParser:
    """Scrapes allocine.fr search, movie and casting pages into a BasicTellicoDOM."""

    # Fields of the movie page whose parsed value is a list of strings.
    _LIST_FIELDS = ('dirs', 'nat', 'genres', 'studio')

    def __init__(self):
        self.__baseURL = 'http://www.allocine.fr'
        self.__basePath = '/film/fichefilm_gen_cfilm'
        self.__castPath = '/film/casting_gen_cfilm'
        self.__searchURL = 'http://www.allocine.fr/recherche/?q=%s'
        self.__movieURL = self.__baseURL + self.__basePath
        self.__castURL = self.__baseURL + self.__castPath

        # Define some regexps (all applied with re.S | re.I)
        self.__regExps = {
            'title': r'<div id="title.*?<span.*?>(?P<title>.+?)</span>',
            'dirs': r"""alis.*?par.*?<a.*?><span.*?>(?P<step1>.+?)</span></a>""",
            'nat': r'Nationalit.*?</span>(?P<nat>.+?)</td',
            'genres': r'<span class="lighten">.*?Genre.*?</span>(?P<step1>.+?)</td',
            'studio': r'Distributeur</div>(?P<step1>.+?)</td',
            'time': r'Dur.*?e *?:*?.*?(?P<hours>[0-9])h *(?P<mins>[0-9]*).*?Ann',
            'year': r'Ann.*?e de production.*?<span.*?>(?P<year>[0-9]{4})</span>',
            'otitle': r'Titre original *?:*?.*?<td>(?P<otitle>.+?)</td>',
            'plot': r'<p itemprop="description">(?P<plot>.*?)</p>',
            'image': r"""<div class="poster">.*?<img src='(?P<image>http://.+?)'.?""",
        }

        self.__castRegExps = {
            'roleactor': r'<li.*?\/personne\/.*?">(.*?)</span>.*?<p.*?R.*?le : (?P<role>.*?)</p>.*?</li',
            'prods': r'<td>[\r\n\t]*Producteur[\r\n\t]*</td>.*?<span.*?>(.*?)</span>',
            'scens': r'<td>[\r\n\t]*Sc.*?nariste[\r\n\t]*</td>.*?<span.*?>(.*?)</span>',
            'comps': r'<td>[\r\n\t]*Compositeur[\r\n\t]*</td>.*?<span.*?>(.*?)</span>',
        }

        self.__domTree = BasicTellicoDOM()

    def run(self, title):
        """
        Runs the allocine.fr parser: fetch movie related links, then fills and prints the DOM tree
        to stdout (in tellico format) so that tellico can use it.

        @param title Movie title, as text or as raw bytes
        """
        if isinstance(title, bytes):
            try:
                # first try system encoding
                encoding = getattr(sys.stdin, 'encoding', None) or sys.getdefaultencoding()
                title = title.decode(encoding)
            except UnicodeDecodeError:
                # on failure, fallback to 'latin-1'
                title = title.decode('latin-1')

        self.__getMovie(title)
        # Print results to stdout
        self.__domTree.printXML()

    def __getHTMLContent(self, url):
        """
        Fetch HTML data from url and store it, decoded to text, in self.__data
        """
        response = urlopen(url)
        try:
            raw = response.read()
            headers = getattr(response, 'headers', None)
            charset = None
            if headers is not None and hasattr(headers, 'get_content_charset'):
                charset = headers.get_content_charset()
        finally:
            response.close()

        # The regular expressions are text patterns, so the payload must be
        # text as well. latin-1 never fails and is the last resort.
        if isinstance(raw, bytes):
            try:
                raw = raw.decode(charset or 'utf-8')
            except (LookupError, UnicodeDecodeError):
                raw = raw.decode('latin-1')
        self.__data = raw

    def __fetchMovieLinks(self, title):
        """
        Retrieve all links related to movie from the search page in self.__data
        @param title Movie title (currently not used to filter the results)
        @return list of (page, name) pairs, or None when nothing was found
        """
        pattern = r"""<td.*?class=['"]totalwidth['"]>.*?<a *href=['"]%s=(?P<page>.*?\.html?)['"] *?>(?P<title>.*?)</a>""" % self.__basePath
        matchList = []
        for page, rawName in re.findall(pattern, self.__data, re.S | re.I):
            name = re.sub(r'([\r\n]+|<b>|</b>)', '', rawName)
            name = re.sub(r'<.*?>', '', name)
            name = name.lstrip(' ')
            if name:
                matchList.append((page, name))

        return matchList or None

    def __parseMovieInfo(self, html):
        """
        Extract the movie fields from the HTML of a movie page.
        @param html text of the movie page
        @return dictionary with one key per regexp except 'image'; fields
                that could not be found are '' (text) or [] (list fields)
        """
        data = {}
        for name, regexp in self.__regExps.items():
            if name == 'image':
                # The poster needs a download, see __fetchImage
                continue

            match = re.search(regexp, html, re.S | re.I)
            if not match:
                data[name] = [] if name in self._LIST_FIELDS else ''
                continue

            if name == 'title':
                data[name] = match.group('title').strip()
            elif name == 'dirs':
                names = re.sub('</?a.*?>', '', match.group('step1')).split(',')
                data[name] = [n.strip() for n in names]
            elif name == 'nat':
                found = re.findall(r'<span class=".*?">(.*?)</span>', match.group('nat'), re.DOTALL)
                data[name] = [n.strip().capitalize() for n in found]
            elif name == 'genres':
                found = re.findall(r'<span itemprop="genre">(.*?)</span>', match.group('step1'), re.DOTALL)
                data[name] = [g.strip().capitalize() for g in found]
            elif name == 'studio':
                found = re.findall(r'<span itemprop="productionCompany">(.*?)</span>', match.group('step1'))
                data[name] = [s.strip() for s in found]
            elif name == 'time':
                # Running time is stored in minutes; the minutes part may be absent ("2h")
                hours, mins = match.group('hours'), match.group('mins')
                data[name] = str(int(hours) * 60 + int(mins or 0))
            elif name == 'year':
                data[name] = match.group('year').strip()
            elif name == 'otitle':
                otitle = re.sub(r'([\r\n]+|<em>|</em>)', '', match.group('otitle'))
                data[name] = otitle.strip()
            elif name == 'plot':
                # Cleans up any HTML entities
                data[name] = self.__cleanUp(match.group('plot').strip())

        return data

    def __fetchImage(self, html):
        """
        Download the poster referenced by a movie page.
        @param html text of the movie page
        @return (image id, base64 encoded data) or None when there is no
                poster or it could not be downloaded
        """
        found = re.search(self.__regExps['image'], html, re.S | re.I)
        if not found:
            return None

        try:
            response = urlopen(found.group('image').strip())
            try:
                raw = response.read()
            finally:
                response.close()
        except Exception:
            # Best effort: a missing cover must not prevent the entry from
            # being reported.
            return None

        # minidom text nodes only accept text, hence the decode.
        return (genMD5() + '.jpeg', base64.encodebytes(raw).decode('ascii'))

    def __parseCasting(self, html):
        """
        Extract actors/roles, producers, scenarists and composers from the
        HTML of a casting page.
        @param html text of the casting page
        @return dictionary with the keys 'actors' (flat list: actor, role,
                actor, role...), 'prods', 'scens' and 'comps'
        """
        cast = {'actors': [], 'prods': [], 'scens': [], 'comps': []}

        # Everything of interest comes after the actors heading
        subset = re.search(r'Acteurs et actrices.*$', html, re.S | re.I)
        if not subset:
            return cast
        subset = subset.group(0)

        # Actors
        for actor, role in re.findall(self.__castRegExps['roleactor'], subset, re.S | re.I):
            cast['actors'].append(re.sub(r'([\r\n\t]+)', '', actor))
            cast['actors'].append(re.sub(r'([\r\n\t]+)', '', role))

        # Producers, Scenarists, Composers
        for kind in ('prods', 'scens', 'comps'):
            found = re.findall(self.__castRegExps[kind], subset, re.S | re.I)
            cast[kind] = [re.sub(r'([\r\n\t]+)', '', k).strip() for k in found]

        return cast

    def __fetchMovieInfo(self, url, url2):
        """
        Looks for movie information
        @param url address of the movie page
        @param url2 address of the casting page
        @return dictionary of movie fields suitable for BasicTellicoDOM.addEntry
                (the caller still has to add the 'allocine' link)
        """
        self.__getHTMLContent(url)
        data = self.__parseMovieInfo(self.__data)
        data['image'] = self.__fetchImage(self.__data)

        # Now looks for casting information
        self.__getHTMLContent(url2)
        data.update(self.__parseCasting(self.__data))
        return data

    def __cleanUp(self, data):
        """
        Cleans up the string(s), replacing raw HTML entities with their
        ISO Latin-1 replacement text.
        @param data string or list of strings
        @return the cleaned string, or a new list of cleaned strings; any
                other value is returned unchanged
        """
        def replaceEntity(match):
            # Unknown entity names are left untouched
            return htmlents.get(match.group(1), match.group(0))

        def clean(text):
            # A single pass, so that "&amp;eacute;" becomes "&eacute;" and is
            # not unescaped a second time.
            return re.sub(r'&([A-Za-z][A-Za-z0-9]*);', replaceEntity, text)

        if isinstance(data, list):
            return [clean(s) for s in data]
        if isinstance(data, str):
            return clean(data)
        return data

    def __getMovie(self, title):
        """
        Search for title and add every movie found to the DOM tree
        @param title Movie title
        """
        if not title:
            return

        self.__title = title
        self.__getHTMLContent(self.__searchURL % quote(self.__title))

        # Get all links
        links = self.__fetchMovieLinks(title)
        if not links:
            return None

        # Now retrieve info
        for entry in links:
            movieLink = "%s=%s" % (self.__movieURL, entry[0])
            data = self.__fetchMovieInfo(url=movieLink, url2="%s=%s" % (self.__castURL, entry[0]))
            # Add allocine link (custom field)
            data['allocine'] = movieLink
            self.__domTree.addEntry(data)


def showUsage():
    """Print the command line usage and exit with status 1."""
    print("Usage: %s movietitle" % sys.argv[0])
    sys.exit(1)


def main():
    """Look up the movie title given as first command line argument."""
    if len(sys.argv) < 2:
        showUsage()

    parser = AlloCineParser()
    parser.run(sys.argv[1])


if __name__ == '__main__':
    main()