1#!/usr/local/bin/python3.8
2# -*- coding: iso-8859-1 -*-
3# kate: replace-tabs off;
4# ***************************************************************************
5#    copyright            : (C) 2006-2010 by Mathias Monnerville
6#    email                : tellico@monnerville.com
7# ***************************************************************************
8#
9# ***************************************************************************
10# *                                                                         *
11# *   This program is free software; you can redistribute it and/or modify  *
12# *   it under the terms of version 2 of the GNU General Public License as  *
13# *   published by the Free Software Foundation;                            *
14# *                                                                         *
15# ***************************************************************************
16#
17# Version 0.7.3: 2010-12-07 (Reported by Romain Henriet)
18# * Fixed some regexp issues
19# * Better handling of image parsing/fetching errors
20#
21# Version 0.7.2.1: 2010-07-27 (Reported by Romain Henriet)
22# * Updated title match to allow searching without diacritical marks
23#
24# Version 0.7.2: 2010-05-27 (Reported by Romain Henriet)
25# * Fixed bug preventing searches with accent marks
26# * Added post-processing cleanup action to replace raw HTML entities with
27#   their ISO Latin-1 replacement text
28#
29# Version 0.7.1: 2010-04-26 (Thanks to Romain Henriet <romain-devel@laposte.net>)
30# * Fixed greedy regexp for genre.  Fixed nationality output. Add studio.
31#
32# Version 0.7: 2009-11-12
33# * Allocine has a brand new website. All regexps were broken.
34#
35# Version 0.6: 2009-03-04 (Thanks to R. Fischer and Henry-Nicolas Tourneur)
36# * Fixed parsing issues (various RegExp issues due to allocine's HTML changes)
37#
38# Version 0.5: 2009-01-21 (Changes contributed by R. Fischer <fischer.tellico@free.fr>)
39# * Added complete distribution of actors and roles, Genres, Nationalities, producers, composer and scenarist
40# * Fixed the plot field that returned a wrong answer when no plot is available
41# * Fixed a bug related to parameters encoding
42#
43# Version 0.4:
44# * Fixed parsing errors: some fields in allocine's HTML pages have changed recently. Multiple actors and genres
45# could not be retrieved. Fixed bad http request error due to some changes in HTML code.
46#
47# Version 0.3:
48# * Fixed parsing: some fields in allocine's HTML pages have changed. Movie's image could not be fetched anymore. Fixed.
49#
50# Version 0.2:
51# * Fixed parsing: allocine's HTML pages have changed. Movie's image could not be fetched anymore.
52#
53# Version 0.1:
54# * Initial release.
55
import base64
import hashlib
import locale
import os
import random
import re
import sys
import time
import types
import urllib
import xml.dom.minidom

try:
	import htmlentitydefs as htmlents
except ImportError:
	try:
		from html.entities import entitydefs as htmlents
	except ImportError:
		print('Python 2.5+ required')
		raise

try:
	# For Python 3.0 and later
	from urllib.request import urlopen
	from urllib.parse import quote
except ImportError:
	# Fall back to Python 2's urllib2 / urllib
	from urllib2 import urlopen
	from urllib import quote
75
# XML declaration printed as the first line of the generated output.
XML_HEADER = """<?xml version="1.0" encoding="UTF-8"?>"""
# Document type declaration (Tellico V9.0 DTD) printed right after the XML header.
DOCTYPE = """<!DOCTYPE tellico PUBLIC "-//Robby Stephenson/DTD Tellico V9.0//EN" "http://periapsis.org/tellico/dtd/v9/tellico.dtd">"""

# Version of this script; see the changelog in the file header.
VERSION = "0.7.3"
80
def genMD5():
	"""
	Return the hexadecimal MD5 digest (32 characters) of a random number.

	The digest is only used as a unique-looking identifier (image ids); it
	is not meant to be cryptographically strong.
	"""
	# hashlib requires bytes on Python 3, so the textual form of the random
	# float is encoded before hashing.
	seed = str(random.random()).encode('utf-8')
	return hashlib.md5(seed).hexdigest()
84
class BasicTellicoDOM:
	"""
	Builds the Tellico XML document that this script prints on stdout.

	The document is a <tellico> root (syntaxVersion 9) holding a single
	<collection>. Movie entries are appended with addEntry() and the result
	is written out with printXML().
	"""

	def __init__(self):
		"""
		Create the document skeleton: root element, collection, field
		declarations and an <images> container that stays detached until
		printXML() is called.
		"""
		self.__doc = xml.dom.minidom.Document()
		self.__root = self.__doc.createElement('tellico')
		self.__root.setAttribute('xmlns', 'http://periapsis.org/tellico/')
		self.__root.setAttribute('syntaxVersion', '9')

		self.__collection = self.__doc.createElement('collection')
		self.__collection.setAttribute('title', 'My Movies')
		self.__collection.setAttribute('type', '3')

		self.__fields = self.__doc.createElement('fields')
		# Add all default (standard) fields
		self.__dfltField = self.__doc.createElement('field')
		self.__dfltField.setAttribute('name', '_default')

		# Add a custom field holding the original title
		self.__customField = self.__doc.createElement('field')
		self.__customField.setAttribute('name', 'titre-original')
		self.__customField.setAttribute('title', 'Original Title')
		self.__customField.setAttribute('flags', '0')
		# Escaped so the value does not depend on the source file encoding
		self.__customField.setAttribute('category', u'G\xe9n\xe9ral')
		self.__customField.setAttribute('format', '1')
		self.__customField.setAttribute('type', '1')
		self.__customField.setAttribute('i18n', 'yes')

		self.__fields.appendChild(self.__dfltField)
		self.__fields.appendChild(self.__customField)
		self.__collection.appendChild(self.__fields)

		self.__images = self.__doc.createElement('images')

		self.__root.appendChild(self.__collection)
		self.__doc.appendChild(self.__root)

		# Current movie id
		self.__currentId = 0

	def __textElement(self, tag, text):
		"""
		Return a new element named tag whose only child is a text node.
		"""
		node = self.__doc.createElement(tag)
		node.appendChild(self.__doc.createTextNode(text))
		return node

	def __listElement(self, listTag, itemTag, values):
		"""
		Return a new listTag element holding one itemTag text element per value.
		"""
		node = self.__doc.createElement(listTag)
		for value in values:
			node.appendChild(self.__textElement(itemTag, value))
		return node

	def __castElement(self, actors):
		"""
		Return the <casts> element.
		@param actors Flat list alternating actor name and role
		"""
		castsNode = self.__doc.createElement('casts')
		for i in range(0, len(actors), 2):
			pair = list(actors[i:i + 2])
			if len(pair) < 2:
				# Trailing actor without a role: keep it with an empty role
				pair.append('')
			castNode = self.__doc.createElement('cast')
			for text in pair:
				castNode.appendChild(self.__textElement('column', text))
			castsNode.appendChild(castNode)
		return castsNode

	def addEntry(self, movieData):
		"""
		Add a movie entry
		@param movieData Dictionary with the keys 'title', 'otitle', 'year',
		'time', 'allocine' and 'plot' (strings), 'genres', 'studio', 'nat',
		'dirs', 'prods', 'scens' and 'comps' (sequences of strings), 'actors'
		(flat list alternating name and role) and optionally 'image' (a
		(image id, base64 data) pair, or None)
		"""
		d = movieData
		entryNode = self.__doc.createElement('entry')
		entryNode.setAttribute('id', str(self.__currentId))

		# All children are built before anything is attached, so that a
		# missing key leaves the document untouched.
		children = [
			self.__textElement('title', d['title']),
			self.__textElement('titre-original', d['otitle']),
			self.__textElement('year', d['year']),
			self.__listElement('genres', 'genre', d['genres']),
			self.__listElement('studios', 'studio', d['studio']),
			self.__listElement('nationalitys', 'nationality', d['nat']),
			self.__castElement(d['actors']),
			self.__listElement('directors', 'director', d['dirs']),
			self.__textElement('running-time', d['time']),
			self.__textElement(u'allocin\xe9-link', d['allocine']),
			self.__textElement('plot', d['plot']),
			self.__listElement('producers', 'producer', d['prods']),
			self.__listElement('composers', 'composer', d['comps']),
			self.__listElement('writers', 'writer', d['scens']),
		]

		image = d.get('image')
		if image:
			imageId, imageData = image[0], image[1]
			if not isinstance(imageData, str):
				# Python 3 base64 helpers return bytes; text nodes need str
				imageData = imageData.decode('ascii')
			imageNode = self.__textElement('image', imageData)
			imageNode.setAttribute('format', 'JPEG')
			imageNode.setAttribute('id', imageId)
			imageNode.setAttribute('width', '120')
			imageNode.setAttribute('height', '160')
			children.append(self.__textElement('cover', imageId))
			self.__images.appendChild(imageNode)

		for child in children:
			entryNode.appendChild(child)

		self.__collection.appendChild(entryNode)
		self.__currentId += 1

	def printXML(self):
		"""
		Outputs XML content to stdout
		"""
		# The images container must come after all entries in the collection
		self.__collection.appendChild(self.__images)
		print(XML_HEADER)
		print(DOCTYPE)
		print(self.__root.toxml())
238
239
class AlloCineParser:
	"""
	Searches allocine.fr for a movie title, scrapes every matching movie page
	and its casting page with regular expressions, and prints the collected
	data as a Tellico document on stdout.
	"""

	# Site URLs. Kept at class level: they are constants shared by all instances.
	__baseURL = 'http://www.allocine.fr'
	__basePath = '/film/fichefilm_gen_cfilm'
	__castPath = '/film/casting_gen_cfilm'
	__searchURL = 'http://www.allocine.fr/recherche/?q=%s'
	__movieURL = __baseURL + __basePath
	__castURL = __baseURL + __castPath

	# Regexps applied to the movie page, keyed by output field name
	__regExps = {
		'title' 	: '<div id="title.*?<span.*?>(?P<title>.+?)</span>',
		'dirs'		: """alis.*?par.*?<a.*?><span.*?>(?P<step1>.+?)</span></a>""",
		'nat'		: 'Nationalit.*?</span>(?P<nat>.+?)</td',
		'genres' 	: '<span class="lighten">.*?Genre.*?</span>(?P<step1>.+?)</td',
		'studio' 	: 'Distributeur</div>(?P<step1>.+?)</td',
		'time' 		: 'Dur.*?e *?:*?.*?(?P<hours>[0-9])h *(?P<mins>[0-9]*).*?Ann',
		'year' 		: 'Ann.*?e de production.*?<span.*?>(?P<year>[0-9]{4})</span>',
		'otitle' 	: 'Titre original *?:*?.*?<td>(?P<otitle>.+?)</td>',
		'plot'		: '<p itemprop="description">(?P<plot>.*?)</p>',
		'image'		: '<div class="poster">.*?<img src=\'(?P<image>http://.+?)\'.?',
	}

	# Regexps applied to the casting page (raw strings: they contain regexp escapes)
	__castRegExps = {
		'roleactor'		: r'<li.*?\/personne\/.*?">(.*?)</span>.*?<p.*?R.*?le : (?P<role>.*?)</p>.*?</li',
		'prods'			: r'<td>[\r\n\t]*Producteur[\r\n\t]*</td>.*?<span.*?>(.*?)</span>',
		'scens'			: r'<td>[\r\n\t]*Sc.*?nariste[\r\n\t]*</td>.*?<span.*?>(.*?)</span>',
		'comps'			: r'<td>[\r\n\t]*Compositeur[\r\n\t]*</td>.*?<span.*?>(.*?)</span>',
	}

	def __init__(self):
		"""
		Create the empty Tellico document that run() fills in.
		"""
		self.__domTree = BasicTellicoDOM()

	def run(self, title):
		"""
		Runs the allocine.fr parser: fetch movie related links, then fills and prints the DOM tree
		to stdout (in tellico format) so that tellico can use it.
		@param title Movie title to search for (text, or encoded bytes)
		"""
		if isinstance(title, bytes):
			# Byte string (e.g. a Python 2 command line argument): decode it
			try:
				# first try system encoding
				title = title.decode(sys.stdin.encoding or sys.getdefaultencoding())
			except UnicodeDecodeError:
				# on failure, fallback to 'latin-1'
				title = title.decode('latin-1')

		self.__getMovie(title)
		# Print results to stdout
		self.__domTree.printXML()

	def __getHTMLContent(self, url):
		"""
		Fetch HTML data from url and store it, decoded to text, in self.__data
		@param url Address of the page to download
		"""
		u = urlopen(url)
		try:
			raw = u.read()
			# Python 3 responses expose the declared charset; older ones may not
			getCharset = getattr(u.headers, 'get_content_charset', None)
			charset = getCharset() if getCharset is not None else None
		finally:
			u.close()

		# The regexps are text patterns, so the page must be text as well
		try:
			self.__data = raw.decode(charset or 'utf-8', 'replace')
		except LookupError:
			# Unknown charset name announced by the server
			self.__data = raw.decode('utf-8', 'replace')

	def __fetchMovieLinks(self, title):
		"""
		Retrieve all links related to movie from the search page in self.__data
		@param title Movie title (currently unused: every result is kept)
		@return List of (page, name) tuples, or None when nothing was found
		"""
		pattern = r"""<td.*?class=['"]totalwidth['"]>.*?<a *href=['"]%s=(?P<page>.*?\.html?)['"] *?>(?P<title>.*?)</a>""" % self.__basePath
		matchList = []
		for page, rawName in re.findall(pattern, self.__data, re.S | re.I):
			# Drop line breaks and markup, then leading spaces
			name = re.sub(r'([\r\n]+|<b>|</b>)', '', rawName)
			name = re.sub(r'<.*?>', '', name)
			name = name.lstrip(' ')
			if name:
				matchList.append((page, name))

		return matchList or None

	def __fetchMovieInfo(self, url, url2):
		"""
		Looks for movie information
		@param url Address of the movie page
		@param url2 Address of the casting page
		@return Dictionary of movie data, ready for BasicTellicoDOM.addEntry()
		except for the 'allocine' key, which the caller adds
		"""
		self.__getHTMLContent(url)
		data = self.__parseMovieInfo(self.__data)
		data['image'] = self.__fetchImage(self.__data)

		# Now looks for casting information
		self.__getHTMLContent(url2)
		data.update(self.__parseCast(self.__data))
		return data

	def __parseMovieInfo(self, html):
		"""
		Extract the movie page fields from html.
		@return Dictionary with one key per regexp (except 'image'); a field
		that could not be found is an empty string
		"""
		data = {}
		for name, regexp in self.__regExps.items():
			if name == 'image':
				# Handled separately: the image has to be downloaded
				continue

			match = re.search(regexp, html, re.S | re.I)
			if not match:
				data[name] = ''
				continue

			if name == 'dirs':
				names = re.sub('</?a.*?>', '', match.group('step1')).split(',')
				value = [n.strip() for n in names]
			elif name == 'nat':
				found = re.findall(r'<span class=".*?">(.*?)</span>', match.group('nat'), re.DOTALL)
				value = [n.strip().capitalize() for n in found]
			elif name == 'genres':
				found = re.findall(r'<span itemprop="genre">(.*?)</span>', match.group('step1'), re.DOTALL)
				value = [g.strip().capitalize() for g in found]
			elif name == 'studio':
				found = re.findall(r'<span itemprop="productionCompany">(.*?)</span>', match.group('step1'))
				value = [s.strip() for s in found]
			elif name == 'time':
				# Running time is reported in minutes; minutes may be absent
				hours, mins = match.group('hours'), match.group('mins')
				value = str(int(hours) * 60 + int(mins or 0))
			elif name == 'otitle':
				value = re.sub(r'([\r\n]+|<em>|</em>)', '', match.group('otitle')).strip()
			else:
				# 'title', 'year' and 'plot': the group is named after the field
				value = match.group(name).strip()

			# Cleans up any HTML entities
			data[name] = self.__cleanUp(value)

		return data

	def __fetchImage(self, html):
		"""
		Download the poster referenced in html.
		@return (image id, base64 encoded data) tuple, or None when the poster
		is missing or cannot be fetched
		"""
		try:
			match = re.search(self.__regExps['image'], html, re.S | re.I)
			if not match:
				return None

			imObj = urlopen(match.group('image').strip())
			try:
				img = imObj.read()
			finally:
				imObj.close()

			# encodestring() was removed in Python 3.9; prefer encodebytes()
			encode = getattr(base64, 'encodebytes', None) or base64.encodestring
			encoded = encode(img)
			if not isinstance(encoded, str):
				encoded = encoded.decode('ascii')
			return (genMD5() + '.jpeg', encoded)
		except Exception:
			# Best effort: a broken poster must not prevent the entry from
			# being returned
			return None

	def __parseCast(self, html):
		"""
		Extract actors/roles, producers, scenarists and composers from the
		casting page.
		@return Dictionary with the keys 'actors' (flat list alternating actor
		and role), 'prods', 'scens' and 'comps'
		"""
		cast = {'actors': [], 'prods': [], 'scens': [], 'comps': []}

		# Only the part of the page starting at the actors heading is parsed
		subset = re.search(r'Acteurs et actrices.*$', html, re.S | re.I)
		if not subset:
			return cast
		subset = subset.group(0)

		# Actors
		for actor, role in re.findall(self.__castRegExps['roleactor'], subset, re.S | re.I):
			cast['actors'].append(re.sub(r'([\r\n\t]+)', '', actor))
			cast['actors'].append(re.sub(r'([\r\n\t]+)', '', role))

		# Producers, Scenarists, Composers
		for kind in ('prods', 'scens', 'comps'):
			found = re.findall(self.__castRegExps[kind], subset, re.S | re.I)
			cast[kind] = [re.sub(r'([\r\n\t]+)', '', k).strip() for k in found]

		return cast

	def __cleanUp(self, data):
		"""
		Cleans up the string(s), replacing raw HTML entities with their
		replacement text.
		@param data string or list of strings
		@return The cleaned string, or a new list of cleaned strings; any
		other value is returned unchanged
		"""
		if isinstance(data, list):
			return [self.__cleanUp(s) for s in data]

		if isinstance(data, (str, type(u''))):
			# Python 2 imports the module, Python 3 the dictionary itself
			entities = getattr(htmlents, 'entitydefs', htmlents)

			def replaceEntity(match):
				# Unknown entities are left as they are
				return entities.get(match.group(1), match.group(0))

			# Single pass, so replacement text is never unescaped twice
			return re.sub(r'&([A-Za-z0-9]+);', replaceEntity, data)

		return data

	def __getMovie(self, title):
		"""
		Search for title and add one entry per matching movie to the DOM tree.
		@param title Movie title
		"""
		if not title:
			return

		self.__title = title
		# quote() needs UTF-8 bytes to handle accented titles on every Python version
		query = title if isinstance(title, bytes) else title.encode('utf-8')
		self.__getHTMLContent(self.__searchURL % quote(query))

		# Get all links
		links = self.__fetchMovieLinks(title)
		if not links:
			return None

		# Now retrieve info
		for entry in links:
			movieURL = "%s=%s" % (self.__movieURL, entry[0])
			data = self.__fetchMovieInfo(url = movieURL, url2 = "%s=%s" % (self.__castURL, entry[0]))
			# Add allocine link (custom field)
			data['allocine'] = movieURL
			self.__domTree.addEntry(data)
461
462
def showUsage():
	"""
	Print the command line usage and terminate the script with exit status 1.
	"""
	scriptName = sys.argv[0]
	usage = "Usage: %s movietitle" % scriptName
	print(usage)
	raise SystemExit(1)
466
def main():
	"""
	Script entry point: search allocine.fr for the title given as first
	command line argument; show the usage when it is missing.
	"""
	arguments = sys.argv
	if not len(arguments) >= 2:
		showUsage()

	AlloCineParser().run(arguments[1])
473
# Run the fetcher only when this file is executed as a script
if __name__ == '__main__':
	main()
476