#!/usr/local/bin/python3.8
# vim:fileencoding=utf-8


__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'

"""
Provides abstraction for metadata reading/writing from a variety of ebook formats.
"""
import os, sys, re
from contextlib import suppress

from calibre import relpath, guess_type, prints, force_unicode
from calibre.utils.config_base import tweaks
from polyglot.builtins import codepoint_to_chr, iteritems, as_unicode
from polyglot.urllib import quote, unquote, urlparse


# Pattern used to split a string holding several author names. It comes from
# the 'authors_split_regex' tweak; a built-in default is used when that tweak
# cannot be compiled.
try:
    _author_pat = re.compile(tweaks['authors_split_regex'])
except Exception:
    prints('Author split regexp:', tweaks['authors_split_regex'],
            'is invalid, using default')
    _author_pat = re.compile(r'(?i),?\s+(and|with)\s+')


def string_to_authors(raw):
    '''
    Split `raw` into a list of author names.

    Names are separated by ``&`` or by anything matching the author split
    pattern. A doubled ampersand (``&&``) stands for a literal ``&`` inside a
    name. Empty names are dropped. A false `raw` gives an empty list.
    '''
    if not raw:
        return []
    # Protect escaped ampersands with a placeholder while splitting
    raw = raw.replace('&&', '\uffff')
    raw = _author_pat.sub('&', raw)
    authors = [a.strip().replace('\uffff', '&') for a in raw.split('&')]
    return [a for a in authors if a]


def authors_to_string(authors):
    '''
    Join `authors` into a single `` & `` separated string.

    Ampersands inside a name are doubled so that :func:`string_to_authors`
    can reverse the operation. Empty names are skipped. Returns an empty
    string when `authors` is None.
    '''
    if authors is not None:
        return ' & '.join([a.replace('&', '&&') for a in authors if a])
    else:
        return ''


def remove_bracketed_text(src, brackets=None):
    '''
    Return `src` with all bracket characters and the text they enclose removed.

    `brackets`: Mapping of opening bracket to closing bracket. Defaults to
    round, square and curly brackets.

    Nesting is tracked per bracket type. Closing brackets that have no
    matching opener are dropped as well.
    '''
    if brackets is None:
        brackets = {'(': ')', '[': ']', '{': '}'}
    from collections import Counter
    counts = Counter()
    total = 0  # number of brackets of any type currently open
    buf = []
    src = force_unicode(src)
    rmap = {v: k for k, v in iteritems(brackets)}
    for char in src:
        if char in brackets:
            counts[char] += 1
            total += 1
        elif char in rmap:
            idx = rmap[char]
            if counts[idx] > 0:
                counts[idx] -= 1
                total -= 1
        elif total < 1:
            # Only keep characters that are outside every bracket
            buf.append(char)
    return ''.join(buf)


def author_to_author_sort(
        author,
        method=None,
        copywords=None,
        use_surname_prefixes=None,
        surname_prefixes=None,
        name_prefixes=None,
        name_suffixes=None
):
    '''
    Compute the sort string for a single author name.

    `method`: One of the values of the 'author_sort_copy_method' tweak, which
    is used when this is None. ``'copy'`` returns `author` unchanged,
    ``'comma'`` returns `author` unchanged when it already contains a comma
    and ``'nocomma'`` suppresses the comma after the leading token.

    Every other parameter overrides the tweak of the corresponding name
    ('author_name_copywords', 'author_use_surname_prefixes',
    'author_surname_prefixes', 'author_name_prefixes',
    'author_name_suffixes') when it is not None.

    The result places the last token that is not a name suffix first,
    followed by the tokens from the first non-prefix token onwards and then
    any trailing suffixes. `author` is returned unchanged when it has fewer
    than two tokens, contains a copy word, or consists only of
    prefixes/suffixes.
    '''
    if not author:
        return ''

    if method is None:
        method = tweaks['author_sort_copy_method']
    if method == 'copy':
        return author

    sauthor = remove_bracketed_text(author).strip()
    if method == 'comma' and ',' in sauthor:
        return author

    tokens = sauthor.split()
    if len(tokens) < 2:
        return author

    ltoks = frozenset(x.lower() for x in tokens)
    copy_words = frozenset(x.lower() for x in (tweaks['author_name_copywords'] if copywords is None else copywords))
    if ltoks.intersection(copy_words):
        return author

    author_use_surname_prefixes = tweaks['author_use_surname_prefixes'] if use_surname_prefixes is None else use_surname_prefixes
    if author_use_surname_prefixes:
        author_surname_prefixes = frozenset(x.lower() for x in (tweaks['author_surname_prefixes'] if surname_prefixes is None else surname_prefixes))
        if len(tokens) == 2 and tokens[0].lower() in author_surname_prefixes:
            return author

    prefixes = {force_unicode(y).lower() for y in (tweaks['author_name_prefixes'] if name_prefixes is None else name_prefixes)}
    prefixes |= {y+'.' for y in prefixes}

    # Index of the first token that is not a name prefix
    for first in range(len(tokens)):
        if tokens[first].lower() not in prefixes:
            break
    else:
        return author

    suffixes = {force_unicode(y).lower() for y in (tweaks['author_name_suffixes'] if name_suffixes is None else name_suffixes)}
    suffixes |= {y+'.' for y in suffixes}

    # Index of the last token that is not a name suffix
    for last in range(len(tokens) - 1, first - 1, -1):
        if tokens[last].lower() not in suffixes:
            break
    else:
        return author

    suffix = ' '.join(tokens[last + 1:])

    if author_use_surname_prefixes:
        if last > first and tokens[last - 1].lower() in author_surname_prefixes:
            # Keep the surname prefix attached to the surname
            tokens[last - 1] += ' ' + tokens[last]
            last -= 1

    atokens = tokens[last:last + 1] + tokens[first:last]
    num_toks = len(atokens)
    if suffix:
        atokens.append(suffix)

    if method != 'nocomma' and num_toks > 1:
        atokens[0] += ','

    return ' '.join(atokens)


def authors_to_sort_string(authors):
    '''
    Return the `` & `` separated sort strings of all `authors`, each computed
    by :func:`author_to_author_sort` with its default arguments.
    '''
    return ' & '.join(map(author_to_author_sort, authors))


# Cache of compiled leading-article patterns, keyed by the `lang` argument
# passed to get_title_sort_pat()
_title_pats = {}


def get_title_sort_pat(lang=None):
    '''
    Return a compiled, case insensitive pattern that matches a leading
    article at the start of a title, for the language `lang`.

    When `lang` is None the 'default_language_for_title_sort' tweak is used
    and, if that is None as well, the language returned by ``get_lang()``.
    The articles come from the 'per_language_title_sort_articles' tweak,
    with the ``'eng'`` entry and finally a built-in English list as
    fallbacks. Results are cached per `lang`.
    '''
    ans = _title_pats.get(lang, None)
    if ans is not None:
        return ans
    q = lang
    from calibre.utils.localization import canonicalize_lang, get_lang
    if lang is None:
        q = tweaks['default_language_for_title_sort']
        if q is None:
            q = get_lang()
    q = canonicalize_lang(q) if q else q
    data = tweaks['per_language_title_sort_articles']
    try:
        ans = data.get(q, None)
    except AttributeError:
        ans = None  # invalid tweak value
    try:
        ans = frozenset(ans) if ans else frozenset(data['eng'])
    except Exception:
        ans = frozenset((r'A\s+', r'The\s+', r'An\s+'))
    ans = '|'.join(ans)
    ans = '^(%s)'%ans
    try:
        ans = re.compile(ans, re.IGNORECASE)
    except Exception:
        # The tweak contained something that is not a valid regular expression
        ans = re.compile(r'^(A|The|An)\s+', re.IGNORECASE)
    _title_pats[lang] = ans
    return ans


# Quote-like characters that are skipped when they start a title
_ignore_starts = '\'"'+''.join(codepoint_to_chr(x) for x in
        list(range(0x2018, 0x201e))+[0x2032, 0x2033])


def title_sort(title, order=None, lang=None):
    '''
    Return the sort string for `title`.

    `order`: Defaults to the 'title_series_sorting' tweak. With
    ``'strictly_alphabetic'`` the stripped title is returned as is.
    `lang`: Passed to :func:`get_title_sort_pat` to select the articles.

    Otherwise a single leading quote character is removed and a leading
    article is moved to the end of the title, after a comma.
    '''
    if order is None:
        order = tweaks['title_series_sorting']
    title = title.strip()
    if order == 'strictly_alphabetic':
        return title
    if title and title[0] in _ignore_starts:
        title = title[1:]
    match = get_title_sort_pat(lang).search(title)
    if match:
        try:
            prep = match.group(1)
        except IndexError:
            pass
        else:
            title = title[len(prep):] + ', ' + prep
            # The text following the article may itself start with a quote
            if title[0] in _ignore_starts:
                title = title[1:]
    return title.strip()


# (value, numeral) pairs in descending order of value, used by roman()
coding = list(zip(
    [1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1],
    ["M", "CM", "D", "CD", "C", "XC", "L", "XL", "X", "IX", "V", "IV", "I"]
))


def roman(num):
    '''
    Return `num` as a roman numeral. Numbers that are not integers in the
    range 1 to 3999 are returned as ``str(num)`` instead.
    '''
    if num <= 0 or num >= 4000 or int(num) != num:
        return str(num)
    result = []
    for d, r in coding:
        while num >= d:
            result.append(r)
            num -= d
    return ''.join(result)


def fmt_sidx(i, fmt='%.2f', use_roman=False):
    '''
    Format the series index `i` for display.

    None and the empty string are treated as 1. Values that cannot be
    converted to a float are returned as ``str(i)``. Whole numbers are shown
    without a fractional part (as roman numerals if `use_roman` is True),
    everything else is formatted using `fmt`.
    '''
    if i is None or i == '':
        i = 1
    try:
        i = float(i)
    except Exception:
        return str(i)
    # is_integer() is False for inf and nan, for which int() would raise
    if i.is_integer():
        return roman(int(i)) if use_roman else '%d'%int(i)
    return fmt%i


class Resource:

    '''
    Represents a resource (usually a file on the filesystem or a URL pointing
    to the web). Such resources are commonly referred to in OPF files.

    They have the interface:

    :member:`path`
    :member:`mime_type`
    :method:`href`

    '''

    def __init__(self, href_or_path, basedir=None, is_path=True):
        '''
        `href_or_path`: A filesystem path if `is_path` is True, otherwise a URL.
        `basedir`: Directory against which relative paths are resolved. If
        None, the current working directory at the time of the call is used.
        '''
        if basedir is None:
            # Resolved per call, not once when the module is imported
            basedir = os.getcwd()
        self._href = None
        self._basedir = basedir
        self.path = None
        self.fragment = ''
        try:
            self.mime_type = guess_type(href_or_path)[0]
        except Exception:
            self.mime_type = None
        if self.mime_type is None:
            self.mime_type = 'application/octet-stream'
        if is_path:
            path = href_or_path
            # Decode first, a bytes path cannot be joined to a str basedir
            if isinstance(path, bytes):
                path = path.decode(sys.getfilesystemencoding())
            if not os.path.isabs(path):
                path = os.path.abspath(os.path.join(basedir, path))
            self.path = path
        else:
            url = urlparse(href_or_path)
            if url[0] not in ('', 'file'):
                # Not a local file, keep the URL as is
                self._href = href_or_path
            else:
                pc = url[2]
                if isinstance(pc, str):
                    pc = pc.encode('utf-8')
                pc = unquote(pc).decode('utf-8')
                self.path = os.path.abspath(os.path.join(basedir, pc.replace('/', os.sep)))
                self.fragment = unquote(url[-1])

    def href(self, basedir=None):
        '''
        Return a URL pointing to this resource. If it is a file on the filesystem
        the URL is relative to `basedir`.

        `basedir`: If None, the basedir of this resource is used (see :method:`set_basedir`).
        If this resource has no basedir, then the current working directory is used as the basedir.
        '''
        if basedir is None:
            if self._basedir:
                basedir = self._basedir
            else:
                basedir = os.getcwd()
        if self.path is None:
            return self._href
        f = self.fragment.encode('utf-8') if isinstance(self.fragment, str) else self.fragment
        frag = '#'+as_unicode(quote(f)) if self.fragment else ''
        if self.path == basedir:
            return ''+frag
        try:
            rpath = relpath(self.path, basedir)
        except OSError:  # On windows path and basedir could be on different drives
            rpath = self.path
        # Convert the separators before encoding, so that replace() always
        # gets arguments of the same type as rpath
        if isinstance(rpath, str):
            rpath = rpath.replace(os.sep, '/').encode('utf-8')
        else:
            rpath = rpath.replace(os.sep.encode('ascii'), b'/')
        return as_unicode(quote(rpath))+frag

    def set_basedir(self, path):
        'Set the directory that :method:`href` uses by default'
        self._basedir = path

    def basedir(self):
        'Return the directory that :method:`href` uses by default'
        return self._basedir

    def __repr__(self):
        return 'Resource(%s, %s)'%(repr(self.path), repr(self.href()))


class ResourceCollection:

    '''
    An ordered, list-like collection of :class:`Resource` objects.
    '''

    def __init__(self):
        self._resources = []

    def __iter__(self):
        yield from self._resources

    def __len__(self):
        return len(self._resources)

    def __getitem__(self, index):
        return self._resources[index]

    def __bool__(self):
        return len(self._resources) > 0

    def __str__(self):
        resources = map(repr, self)
        return '[%s]'%', '.join(resources)

    def __repr__(self):
        return str(self)

    def append(self, resource):
        'Add `resource`, raising ValueError if it is not a :class:`Resource`'
        if not isinstance(resource, Resource):
            raise ValueError('Can only append objects of type Resource')
        self._resources.append(resource)

    def remove(self, resource):
        'Same as list.remove(resource)'
        self._resources.remove(resource)

    def replace(self, start, end, items):
        'Same as list[start:end] = items'
        self._resources[start:end] = items

    @staticmethod
    def from_directory_contents(top, topdown=True):
        '''
        Return a collection with one :class:`Resource` for every file found
        in the directory tree rooted at `top`. The basedir of every resource
        is `top`.

        `topdown`: Passed on to :func:`os.walk`.
        '''
        collection = ResourceCollection()
        # os.walk() yields (dirpath, dirnames, filenames), the resources are
        # the files inside every visited directory
        for dirpath, dirnames, filenames in os.walk(top, topdown=topdown):
            for name in filenames:
                path = os.path.abspath(os.path.join(dirpath, name))
                collection.append(Resource(path, basedir=top))
        return collection

    def set_basedir(self, path):
        'Set the basedir of every resource in this collection to `path`'
        for res in self:
            res.set_basedir(path)


def MetaInformation(title, authors=(_('Unknown'),)):
    ''' Convenient encapsulation of book metadata, needed for compatibility
    @param title: title or ``_('Unknown')`` or a MetaInformation object
    @param authors: List of strings or []
    '''
    from calibre.ebooks.metadata.book.base import Metadata
    mi = None
    if hasattr(title, 'title') and hasattr(title, 'authors'):
        # A metadata object was passed in instead of a title
        mi = title
        title = mi.title
        authors = mi.authors
    return Metadata(title, authors, other=mi)


def check_digit_for_isbn10(isbn):
    '''
    Return the check digit, as a string, computed from the first nine
    characters of `isbn`. A check value of 10 is returned as ``'X'``.
    '''
    check = sum((i+1)*int(isbn[i]) for i in range(9)) % 11
    return 'X' if check == 10 else str(check)


def check_digit_for_isbn13(isbn):
    '''
    Return the check digit, as a string, computed from the first twelve
    characters of `isbn`.
    '''
    check = 10 - (sum((1 if i%2 ==0 else 3)*int(isbn[i]) for i in range(12)) % 10)
    if check == 10:
        check = 0
    return str(check)


def check_isbn10(isbn):
    '''
    Return True if the tenth character of `isbn` is the correct ISBN-10
    check digit. Any error while checking gives False.
    '''
    with suppress(Exception):
        return check_digit_for_isbn10(isbn) == isbn[9]
    return False


def check_isbn13(isbn):
    '''
    Return True if the thirteenth character of `isbn` is the correct ISBN-13
    check digit. Any error while checking gives False.
    '''
    with suppress(Exception):
        return check_digit_for_isbn13(isbn) == isbn[12]
    return False


def check_isbn(isbn, simple_sanitize=False):
    '''
    Return the cleaned up ISBN if `isbn` is a valid ISBN-10 or ISBN-13,
    otherwise None.

    `simple_sanitize`: If True only dashes and spaces are removed from
    `isbn`, otherwise everything except digits and ``X`` is removed.

    ISBNs made up of a single repeated digit are rejected.
    '''
    if not isbn:
        return None
    if simple_sanitize:
        isbn = isbn.upper().replace('-', '').strip().replace(' ', '')
    else:
        isbn = re.sub(r'[^0-9X]', '', isbn.upper())
    il = len(isbn)
    if il not in (10, 13):
        return None
    all_same = re.match(r'(\d)\1{9,12}$', isbn)
    if all_same is not None:
        return None
    if il == 10:
        return isbn if check_isbn10(isbn) else None
    if il == 13:
        return isbn if check_isbn13(isbn) else None
    return None


def normalize_isbn(isbn):
    '''
    Return `isbn` as a cleaned up ISBN-13, converting a valid ISBN-10 by
    prefixing ``978`` and recomputing the check digit. If `isbn` is empty or
    not a valid ISBN it is returned unchanged.
    '''
    if not isbn:
        return isbn
    ans = check_isbn(isbn)
    if ans is None:
        return isbn
    if len(ans) == 10:
        ans = '978' + ans[:9]
        ans += check_digit_for_isbn13(ans)
    return ans


def check_issn(issn):
    '''
    Return the cleaned up ISSN (only digits and ``X``) if the eighth
    character of the cleaned up `issn` is the correct check character for
    the first seven digits, otherwise None.
    '''
    if not issn:
        return None
    issn = re.sub(r'[^0-9X]', '', issn.upper())
    try:
        digits = tuple(map(int, issn[:7]))
        products = [(8 - i) * d for i, d in enumerate(digits)]
        # The trailing % 11 maps a remainder of 0 to the check digit 0
        # instead of 11
        check = (11 - sum(products) % 11) % 11
        if (check == 10 and issn[7] == 'X') or check == int(issn[7]):
            return issn
    except (ValueError, IndexError):
        # Too short, or a non digit where a digit is required
        pass
    return None


def format_isbn(isbn):
    '''
    Return `isbn` split into dash separated groups at fixed positions. If
    `isbn` is not a valid ISBN it is returned unchanged.
    '''
    cisbn = check_isbn(isbn)
    if not cisbn:
        return isbn
    i = cisbn
    if len(i) == 10:
        return '-'.join((i[:2], i[2:6], i[6:9], i[9]))
    return '-'.join((i[:3], i[3:5], i[5:9], i[9:12], i[12]))


def check_doi(doi):
    'Check if something that looks like a DOI is present anywhere in the string'
    if not doi:
        return None
    doi_check = re.search(r'10\.\d{4}/\S+', doi)
    if doi_check is not None:
        return doi_check.group()
    return None


def rating_to_stars(value, allow_half_stars=False, star='★', half='⯨'):
    '''
    Convert a rating into a string of stars.

    `value`: The rating, it is converted to an integer and clamped to the
    range 0 to 10. A false value counts as 0. Every two points give one
    `star`.
    `allow_half_stars`: If True an odd rating gets a trailing `half`,
    otherwise the odd point is ignored.
    '''
    r = max(0, min(int(value or 0), 10))
    ans = star * (r // 2)
    if allow_half_stars and r % 2:
        ans += half
    return ans