1#!/usr/local/bin/python3.8
2# vim:fileencoding=utf-8
3
4
5__license__   = 'GPL v3'
6__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
7__docformat__ = 'restructuredtext en'
8
9"""
Provides abstraction for metadata reading/writing from a variety of ebook formats.
11"""
12import os, sys, re
13from contextlib import suppress
14
15from calibre import relpath, guess_type, prints, force_unicode
16from calibre.utils.config_base import tweaks
17from polyglot.builtins import codepoint_to_chr, iteritems, as_unicode
18from polyglot.urllib import quote, unquote, urlparse
19
20
# Compile the user supplied pattern (the 'authors_split_regex' tweak) that is
# used to separate multiple authors held in a single string. If compiling it
# fails for any reason, report the problem and use a built-in pattern instead.
try:
    _author_pat = re.compile(tweaks['authors_split_regex'])
except Exception:
    prints('Author split regexp:', tweaks['authors_split_regex'],
            'is invalid, using default')
    # Case-insensitive: an optional comma, whitespace, "and" or "with", whitespace
    _author_pat = re.compile(r'(?i),?\s+(and|with)\s+')
27
28
def string_to_authors(raw):
    '''
    Split a string naming one or more authors into a list of author names.

    Names are separated by ``&`` or by any text matching the module level
    author split pattern. A doubled ampersand (``&&``) stands for a literal
    ``&`` inside a name. Surrounding whitespace is stripped and empty names
    are dropped. A false ``raw`` (empty string, None) gives an empty list.
    '''
    if not raw:
        return []
    placeholder = '\uffff'
    # Hide escaped ampersands so that they survive the split on '&' below
    protected = raw.replace('&&', placeholder)
    normalized = _author_pat.sub('&', protected)
    names = []
    for piece in normalized.split('&'):
        name = piece.strip().replace(placeholder, '&')
        if name:
            names.append(name)
    return names
36
37
def authors_to_string(authors):
    '''
    Join an iterable of author names into a single ``' & '`` separated string.

    Any ``&`` inside a name is doubled so that it is not mistaken for a
    separator. Empty names are skipped. ``None`` gives an empty string.
    '''
    if authors is None:
        return ''
    escaped = [name.replace('&', '&&') for name in authors if name]
    return ' & '.join(escaped)
43
44
def remove_bracketed_text(src, brackets=None):
    '''
    Return ``src`` without any bracketed text and without the brackets.

    ``brackets`` maps every opening bracket character to its closing
    character; round, square and curly brackets are used when it is None.
    Bracket characters are never copied to the output, and ordinary
    characters are copied only while no bracket is open.
    '''
    if brackets is None:
        brackets = {'(': ')', '[': ']', '{': '}'}
    from collections import Counter
    open_counts = Counter()  # currently open brackets, per opening character
    depth = 0                # currently open brackets, all kinds together
    kept = []
    src = force_unicode(src)
    closers = {closing: opening for opening, closing in iteritems(brackets)}
    for ch in src:
        if ch in brackets:
            open_counts[ch] += 1
            depth += 1
            continue
        if ch in closers:
            opener = closers[ch]
            # A closing bracket with no matching opener changes nothing
            if open_counts[opener] > 0:
                open_counts[opener] -= 1
                depth -= 1
            continue
        if depth < 1:
            kept.append(ch)
    return ''.join(kept)
66
67
def author_to_author_sort(
        author,
        method=None,
        copywords=None,
        use_surname_prefixes=None,
        surname_prefixes=None,
        name_prefixes=None,
        name_suffixes=None
):
    '''
    Return the form of ``author`` that is used for sorting.

    The name is split on whitespace after bracketed text has been removed.
    Leading name prefixes are dropped, trailing name suffixes are moved to the
    end, and the last remaining token is moved to the front, followed by a
    comma unless ``method`` is ``'nocomma'``.

    ``author`` is returned unchanged when ``method`` is ``'copy'``, when
    ``method`` is ``'comma'`` and the name already contains a comma, when the
    name has fewer than two tokens, when any token is a copy word, when the
    name is exactly a surname prefix followed by one token, or when every
    token is a prefix or suffix. A false ``author`` gives an empty string.

    Each optional argument left as None is read from the corresponding tweak.
    '''
    if not author:
        return ''

    if method is None:
        method = tweaks['author_sort_copy_method']
    if method == 'copy':
        return author

    stripped = remove_bracketed_text(author).strip()
    if method == 'comma' and ',' in stripped:
        return author

    tokens = stripped.split()
    if len(tokens) < 2:
        return author

    lower_tokens = frozenset(tok.lower() for tok in tokens)
    if copywords is None:
        copywords = tweaks['author_name_copywords']
    lower_copywords = frozenset(word.lower() for word in copywords)
    if not lower_tokens.isdisjoint(lower_copywords):
        return author

    if use_surname_prefixes is None:
        use_surname_prefixes = tweaks['author_use_surname_prefixes']
    if use_surname_prefixes:
        if surname_prefixes is None:
            surname_prefixes = tweaks['author_surname_prefixes']
        surname_prefix_set = frozenset(p.lower() for p in surname_prefixes)
        if len(tokens) == 2 and tokens[0].lower() in surname_prefix_set:
            return author

    if name_prefixes is None:
        name_prefixes = tweaks['author_name_prefixes']
    leading = {force_unicode(p).lower() for p in name_prefixes}
    # Also accept every prefix written with a trailing period
    leading.update({p + '.' for p in leading})

    # Index of the first token that is not a name prefix
    first = next(
        (idx for idx, tok in enumerate(tokens) if tok.lower() not in leading),
        None)
    if first is None:
        return author

    if name_suffixes is None:
        name_suffixes = tweaks['author_name_suffixes']
    trailing = {force_unicode(s).lower() for s in name_suffixes}
    trailing.update({s + '.' for s in trailing})

    # Index of the last token, at or after ``first``, that is not a name suffix
    last = next(
        (idx for idx in range(len(tokens) - 1, first - 1, -1)
         if tokens[idx].lower() not in trailing),
        None)
    if last is None:
        return author

    suffix = ' '.join(tokens[last + 1:])

    if use_surname_prefixes and last > first and tokens[last - 1].lower() in surname_prefix_set:
        # Keep a surname prefix attached to the token that follows it
        tokens[last - 1] = tokens[last - 1] + ' ' + tokens[last]
        last -= 1

    reordered = [tokens[last]] + tokens[first:last]
    # Decided before the suffix is appended: a lone token gets no comma
    wants_comma = method != 'nocomma' and len(reordered) > 1
    if suffix:
        reordered.append(suffix)
    if wants_comma:
        reordered[0] += ','

    return ' '.join(reordered)
138
139
def authors_to_sort_string(authors):
    '''
    Return the sort forms of all ``authors`` joined with ``' & '``.

    Each name is converted with :func:`author_to_author_sort` using its
    default arguments.
    '''
    sort_forms = [author_to_author_sort(name) for name in authors]
    return ' & '.join(sort_forms)
142
143
# Cache of compiled leading-article patterns, keyed by the ``lang`` argument
# that was passed to get_title_sort_pat() (including None).
_title_pats = {}


def get_title_sort_pat(lang=None):
    '''
    Return a compiled, case-insensitive regular expression that matches a
    leading article at the start of a title.

    ``lang`` selects the entry of the ``per_language_title_sort_articles``
    tweak to use. When it is None the ``default_language_for_title_sort``
    tweak is used, and if that is None too, the value of ``get_lang()``.
    If no articles are found for the language the ``'eng'`` entry is used.
    An unusable tweak value falls back to a built-in English pattern instead
    of raising. Results are cached per ``lang`` value.
    '''
    ans = _title_pats.get(lang, None)
    if ans is not None:
        return ans
    q = lang
    from calibre.utils.localization import canonicalize_lang, get_lang
    if lang is None:
        q = tweaks['default_language_for_title_sort']
        if q is None:
            q = get_lang()
    q = canonicalize_lang(q) if q else q
    data = tweaks['per_language_title_sort_articles']
    try:
        articles = data.get(q, None)
    except AttributeError:
        articles = None  # invalid tweak value
    # ``except Exception`` rather than a bare ``except`` so that
    # KeyboardInterrupt and SystemExit are not swallowed.
    try:
        articles = frozenset(articles) if articles else frozenset(data['eng'])
    except Exception:
        articles = frozenset((r'A\s+', r'The\s+', r'An\s+'))
    try:
        # The join is inside the guard as well: an entry that is not a string
        # makes it raise TypeError, which is just another invalid tweak value.
        ans = re.compile('^(%s)' % '|'.join(articles), re.IGNORECASE)
    except Exception:
        ans = re.compile(r'^(A|The|An)\s+', re.IGNORECASE)
    _title_pats[lang] = ans
    return ans
175
176
# Characters ignored at the very start of a title when computing its sort
# form: the ASCII quotes, the code points U+2018 to U+201D and U+2032, U+2033.
_ignore_starts = '\'"' + ''.join(
    map(codepoint_to_chr, [*range(0x2018, 0x201e), 0x2032, 0x2033]))
179
180
def title_sort(title, order=None, lang=None):
    '''
    Return the form of ``title`` that is used for sorting.

    With ``order`` equal to ``'strictly_alphabetic'`` the title is only
    stripped of surrounding whitespace. Otherwise one leading character from
    ``_ignore_starts`` is removed, and a leading article matched by the
    pattern for ``lang`` is moved to the end, after ``', '``. When ``order``
    is None the ``title_series_sorting`` tweak is used.
    '''
    if order is None:
        order = tweaks['title_series_sorting']
    title = title.strip()
    if order == 'strictly_alphabetic':
        return title
    if title and title[0] in _ignore_starts:
        title = title[1:]
    found = get_title_sort_pat(lang).search(title)
    if not found:
        return title.strip()
    try:
        article = found.group(1)
    except IndexError:
        # The pattern has no capture group, leave the title as it is
        return title.strip()
    title = title[len(article):] + ', ' + article
    if title[0] in _ignore_starts:
        title = title[1:]
    return title.strip()
200
201
# Values and their Roman numeral symbols, largest value first
coding = [
    (1000, "M"), (900, "CM"), (500, "D"), (400, "CD"),
    (100, "C"), (90, "XC"), (50, "L"), (40, "XL"),
    (10, "X"), (9, "IX"), (5, "V"), (4, "IV"),
    (1, "I"),
]


def roman(num):
    '''
    Return ``num`` written as a Roman numeral.

    Numbers that are not whole, or that lie outside 1 to 3999, are returned
    as ``str(num)`` instead.
    '''
    if num <= 0 or num >= 4000 or int(num) != num:
        return str(num)
    remaining = num
    symbols = []
    for value, symbol in coding:
        # Use each symbol as often as it still fits into what is left
        while remaining >= value:
            remaining -= value
            symbols.append(symbol)
    return ''.join(symbols)
217
218
def fmt_sidx(i, fmt='%.2f', use_roman=False):
    '''
    Format the series index ``i`` for display.

    None and the empty string are treated as 1. A value that cannot be
    converted to a float is returned as ``str(i)``. Whole numbers are shown
    without a fractional part, or converted with ``roman()`` when
    ``use_roman`` is true. Everything else, including infinity and NaN, is
    formatted with ``fmt``.
    '''
    if i is None or i == '':
        i = 1
    try:
        i = float(i)
    except Exception:
        return str(i)
    try:
        whole = int(i)
    except (OverflowError, ValueError):
        # int() raises OverflowError for infinity and ValueError for NaN;
        # such values are never whole numbers, so format them as floats.
        return fmt % i
    if whole == i:
        return roman(whole) if use_roman else '%d' % whole
    return fmt % i
229
230
class Resource:

    '''
    Represents a resource (usually a file on the filesystem or a URL pointing
    to the web). Such resources are commonly referred to in OPF files.

    They have the interface:

    :member:`path`
    :member:`mime_type`
    :method:`href`

    '''

    def __init__(self, href_or_path, basedir=None, is_path=True):
        '''
        `href_or_path`: A filesystem path if `is_path` is True, otherwise a URL.
        Relative paths, and URLs with no scheme or the ``file`` scheme, are
        resolved against `basedir`. Any other URL is stored unchanged and
        :member:`path` stays None.

        `basedir`: If None, the current working directory at the time of the
        call is used.
        '''
        if basedir is None:
            # Looked up on every call. Using os.getcwd() as the default value in
            # the signature would freeze the directory the module was imported in.
            basedir = os.getcwd()
        self._href = None
        self._basedir = basedir
        self.path = None
        self.fragment = ''
        try:
            self.mime_type = guess_type(href_or_path)[0]
        except Exception:
            self.mime_type = None
        if self.mime_type is None:
            self.mime_type = 'application/octet-stream'
        if is_path:
            path = href_or_path
            if isinstance(path, bytes) and not isinstance(basedir, bytes):
                # os.path.join() refuses to mix bytes and str, so decode before
                # the path is combined with a text basedir.
                path = path.decode(sys.getfilesystemencoding())
            if not os.path.isabs(path):
                path = os.path.abspath(os.path.join(basedir, path))
            if isinstance(path, bytes):
                path = path.decode(sys.getfilesystemencoding())
            self.path = path
        else:
            url = urlparse(href_or_path)
            if url[0] not in ('', 'file'):
                self._href = href_or_path
            else:
                pc = url[2]
                if isinstance(pc, str):
                    pc = pc.encode('utf-8')
                pc = unquote(pc).decode('utf-8')
                self.path = os.path.abspath(os.path.join(basedir, pc.replace('/', os.sep)))
                self.fragment = unquote(url[-1])

    def href(self, basedir=None):
        '''
        Return a URL pointing to this resource. If it is a file on the filesystem
        the URL is relative to `basedir`.

        `basedir`: If None, the basedir of this resource is used (see :method:`set_basedir`).
        If this resource has no basedir, then the current working directory is used as the basedir.
        '''
        if basedir is None:
            basedir = self._basedir or os.getcwd()
        if self.path is None:
            return self._href
        f = self.fragment.encode('utf-8') if isinstance(self.fragment, str) else self.fragment
        frag = '#'+as_unicode(quote(f)) if self.fragment else ''
        if self.path == basedir:
            return ''+frag
        try:
            rpath = relpath(self.path, basedir)
        except (OSError, ValueError):
            # On windows path and basedir could be on different drives.
            # NOTE(review): os.path.relpath reports that with ValueError;
            # confirm which exception calibre's relpath raises.
            rpath = self.path
        if isinstance(rpath, str):
            # Convert the separators while this is still text: bytes.replace()
            # raises TypeError when it is given str arguments.
            rpath = rpath.replace(os.sep, '/').encode('utf-8')
        else:
            rpath = rpath.replace(os.sep.encode('utf-8'), b'/')
        return as_unicode(quote(rpath))+frag

    def set_basedir(self, path):
        'Set the directory that relative URLs from :method:`href` are based on'
        self._basedir = path

    def basedir(self):
        'Return the directory that relative URLs from :method:`href` are based on'
        return self._basedir

    def __repr__(self):
        return 'Resource(%s, %s)'%(repr(self.path), repr(self.href()))
310
311
class ResourceCollection:

    'An ordered, list like container of :class:`Resource` objects'

    def __init__(self):
        self._resources = []

    def __iter__(self):
        yield from self._resources

    def __len__(self):
        return len(self._resources)

    def __getitem__(self, index):
        return self._resources[index]

    def __bool__(self):
        return len(self._resources) > 0

    def __str__(self):
        resources = map(repr, self)
        return '[%s]'%', '.join(resources)

    def __repr__(self):
        return str(self)

    def append(self, resource):
        'Add `resource` to the end. Raises ValueError if it is not a Resource'
        if not isinstance(resource, Resource):
            raise ValueError('Can only append objects of type Resource')
        self._resources.append(resource)

    def remove(self, resource):
        'Remove the first occurrence of `resource`, same as list.remove()'
        self._resources.remove(resource)

    def replace(self, start, end, items):
        'Same as list[start:end] = items'
        self._resources[start:end] = items

    @staticmethod
    def from_directory_contents(top, topdown=True):
        '''
        Return a collection with one :class:`Resource` for every file found
        in the directory tree rooted at `top`. The basedir of every resource
        is set to `top`. Directories themselves are not added.

        `topdown`: Passed on to :func:`os.walk`, it controls the order in
        which the directories are visited.
        '''
        collection = ResourceCollection()
        # os.walk() yields (dirpath, dirnames, filenames). The names have to
        # be joined to dirpath one at a time, the lists cannot be joined.
        for dirpath, dirnames, filenames in os.walk(top, topdown=topdown):
            for name in filenames:
                path = os.path.abspath(os.path.join(dirpath, name))
                res = Resource(path)
                res.set_basedir(top)
                collection.append(res)
        return collection

    def set_basedir(self, path):
        'Set the basedir of every resource in this collection to `path`'
        for res in self:
            res.set_basedir(path)
361
362
def MetaInformation(title, authors=(_('Unknown'),)):
    ''' Convenient encapsulation of book metadata, needed for compatibility

        Returns a ``Metadata`` object. If ``title`` is itself an object with
        ``title`` and ``authors`` attributes, those two values are used and
        the object is also passed to ``Metadata`` as ``other``.

        @param title: title or ``_('Unknown')`` or a MetaInformation object
        @param authors: List of strings or []
    '''
    from calibre.ebooks.metadata.book.base import Metadata
    is_metadata_like = hasattr(title, 'title') and hasattr(title, 'authors')
    if not is_metadata_like:
        return Metadata(title, authors, other=None)
    source = title
    title, authors = source.title, source.authors
    return Metadata(title, authors, other=source)
375
376
def check_digit_for_isbn10(isbn):
    '''
    Return the ISBN-10 check character for the first nine digits of ``isbn``.

    The result is a single digit as a string, or ``'X'`` for a value of 10.
    '''
    weighted_sum = 0
    for position in range(9):
        weighted_sum += (position + 1) * int(isbn[position])
    remainder = weighted_sum % 11
    if remainder == 10:
        return 'X'
    return str(remainder)
380
381
def check_digit_for_isbn13(isbn):
    '''
    Return the ISBN-13 check digit, as a string, for the first twelve digits
    of ``isbn``. Digits are weighted alternately by 1 and 3.
    '''
    total = 0
    for position in range(12):
        weight = 3 if position % 2 else 1
        total += weight * int(isbn[position])
    # The outer modulo turns a result of 10 into 0
    return str((10 - total % 10) % 10)
387
388
def check_isbn10(isbn):
    '''
    Return True if the tenth character of ``isbn`` is the correct ISBN-10
    check character for its first nine digits. Any error gives False.
    '''
    try:
        expected = check_digit_for_isbn10(isbn)
        return expected == isbn[9]
    except Exception:
        return False
393
394
def check_isbn13(isbn):
    '''
    Return True if the thirteenth character of ``isbn`` is the correct
    ISBN-13 check digit for its first twelve digits. Any error gives False.
    '''
    try:
        expected = check_digit_for_isbn13(isbn)
        return expected == isbn[12]
    except Exception:
        return False
399
400
def check_isbn(isbn, simple_sanitize=False):
    '''
    Return the cleaned up form of ``isbn`` if it is a valid ISBN-10 or
    ISBN-13, otherwise None.

    Normally every character other than a digit or ``X`` is removed. With
    ``simple_sanitize`` only hyphens, spaces and surrounding whitespace are
    removed. The text is upper-cased in both cases. Values consisting of a
    single repeated digit are rejected.
    '''
    if not isbn:
        return None
    if simple_sanitize:
        cleaned = isbn.upper().replace('-', '').strip().replace(' ', '')
    else:
        cleaned = re.sub(r'[^0-9X]', '', isbn.upper())
    length = len(cleaned)
    if length != 10 and length != 13:
        return None
    if re.match(r'(\d)\1{9,12}$', cleaned) is not None:
        return None
    is_valid = check_isbn10 if length == 10 else check_isbn13
    return cleaned if is_valid(cleaned) else None
419
420
def normalize_isbn(isbn):
    '''
    Return ``isbn`` as a cleaned up ISBN-13.

    A valid ISBN-10 is converted by prefixing ``978`` and recomputing the
    check digit. A false or invalid ``isbn`` is returned unchanged.
    '''
    if not isbn:
        return isbn
    valid = check_isbn(isbn)
    if valid is None:
        return isbn
    if len(valid) != 10:
        return valid
    body = '978' + valid[:9]
    return body + check_digit_for_isbn13(body)
431
432
def check_issn(issn):
    '''
    Return the cleaned up form of ``issn`` (eight characters, no hyphen) if
    it is a valid ISSN, otherwise None.

    Every character other than a digit or ``X`` is removed after
    upper-casing. The first seven digits are weighted 8 down to 2; the check
    character brings the weighted sum to a multiple of 11, with ``X``
    standing for a value of 10.
    '''
    if not issn:
        return None
    issn = re.sub(r'[^0-9X]', '', issn.upper())
    if len(issn) != 8:
        # An ISSN is exactly eight characters, anything else cannot be valid
        return None
    try:
        digits = [int(c) for c in issn[:7]]
    except ValueError:
        return None  # an X is only allowed as the check character
    total = sum((8 - i) * d for i, d in enumerate(digits))
    # The outer modulo maps 11 to 0: a sum that is already a multiple of 11
    # has the check digit 0, not 11.
    check = (11 - total % 11) % 11
    expected = 'X' if check == 10 else str(check)
    return issn if issn[7] == expected else None
446
447
def format_isbn(isbn):
    '''
    Return ``isbn`` with hyphens inserted at fixed positions.

    A valid ISBN-10 is split into groups of 2, 4, 3 and 1 characters, a valid
    ISBN-13 into groups of 3, 2, 4, 3 and 1. An invalid ``isbn`` is returned
    unchanged.
    '''
    valid = check_isbn(isbn)
    if not valid:
        return isbn
    if len(valid) == 10:
        groups = (valid[:2], valid[2:6], valid[6:9], valid[9])
    else:
        groups = (valid[:3], valid[3:5], valid[5:9], valid[9:12], valid[12])
    return '-'.join(groups)
456
457
def check_doi(doi):
    '''
    Check if something that looks like a DOI is present anywhere in the string

    Returns the first match, or None if there is none. A match is ``10.``
    followed by a registrant code of four to nine digits, a slash and a run
    of non-whitespace characters.
    '''
    if not doi:
        return None
    # Registrant codes are not limited to four digits (e.g. 10.48550/...)
    doi_check = re.search(r'10\.\d{4,9}/\S+', doi)
    if doi_check is not None:
        return doi_check.group()
    return None
466
467
def rating_to_stars(value, allow_half_stars=False, star='★', half='⯨'):
    '''
    Return a string of stars for a rating ``value``.

    The value is converted to an integer and clamped to the range 0 to 10;
    a false value counts as 0. Every two points give one ``star``. An odd
    value adds one ``half`` character when ``allow_half_stars`` is true.
    '''
    rating = int(value or 0)
    rating = max(0, min(rating, 10))
    full_stars, leftover = divmod(rating, 2)
    stars = star * full_stars
    if allow_half_stars and leftover:
        stars += half
    return stars
474