2# Copyright 2008-2015 Jaap Karssenberg <jaap.karssenberg@gmail.com>
5import re
6import logging
7import itertools
9logger = logging.getLogger('zim.notebook')
12from zim.parsing import link_type
13from zim.errors import Error
15import zim.formats
16import zim.fs
17import zim.newfs
19from zim.signals import SignalEmitter, SIGNAL_NORMAL
21import zim.datetimetz as datetime
# Matches any run of two or more ':' separators, used to collapse
# these into a single separator.
_pagename_reduce_colon_re = re.compile('::+')

# This pattern matches a non-alphanumber at start or after the ':'
# separator. It also matches any invalid character.
# The UNICODE flag is used to make the alphanumber check international
# (it is the default for string patterns in python3, but we keep it
# explicit to document the intent).
# Raw strings are used for the parts containing "\W" to avoid an invalid
# escape sequence in a normal string literal.
_pagename_invalid_char_re = re.compile(
	'(' +
		r'^[_\W]+|(?<=:)[_\W]+' +
	'|' +
		'[' + re.escape(''.join(
			("?", "#", "/", "\\", "*", '"', "<", ">", "|", "%", "\t", "\n", "\r")
		)) + ']' +
	')',
	re.UNICODE
)
def shortest_unique_names(paths):
	'''Returns the shortest unique name for each path in paths

	When the basename of a path is unique within C{paths}, the basename
	is used. Otherwise parent names are prepended one by one until the
	resulting name no longer matches the end of any of the other paths
	with the same basename. If a path runs out of parts before it
	becomes unique, its full name is used.

	@param paths: list of L{Path} objects
	@returns: list of strings, in the same order as C{paths}
	'''
	# Group paths by basename to find the ones that conflict
	by_basename = {}
	for path in paths:
		by_basename.setdefault(path.basename, []).append(path)

	result = []
	for path in paths:
		conflicts = by_basename[path.basename]
		if len(conflicts) == 1:
			result.append(path.basename)
			continue

		# Work on a copy, the grouped lists are shared between iterations
		others = list(conflicts)
		others.remove(path) # removes one occurrence only, duplicates remain
		rivals = [p.name.split(':') for p in others]

		parts = path.name.split(':')
		names = []
		for depth in range(1, len(parts) + 1):
			part = parts[-depth]
			names.append(part)
			# Only paths that matched all parts so far can still conflict,
			# paths that already differ must not lengthen the name again
			rivals = [
				r for r in rivals
				if len(r) >= depth and r[-depth] == part
			]
			if not rivals:
				break

		result.append(':'.join(reversed(names)))

	return result
class Path(object):
	r'''Class representing a page name in the notebook

	This is the parent class for the Page class. It contains the name
	of the page and is used instead of the actual page object by methods
	that only need to know the name of the page. Path objects have no
	internal state and are essentially normalized page names. It also
	has a number of methods to compare page names and to determine what
	the parent pages are etc.

	@ivar name: the full name of the path
	@ivar parts: all the parts of the name (split on ":")
	@ivar basename: the basename of the path (last part of the name)
	@ivar namespace: the name for the parent page or empty string
	@ivar isroot: C{True} when this Path represents the top level namespace
	@ivar parent: the L{Path} object for the parent page


	Valid characters in page names
	==============================

	A number of characters are not valid in page names as used in Zim
	notebooks.

	Reserved characters are:
	  - The ':' is reserved as separator
	  - The '?' is reserved to encode url style options
	  - The '#' is reserved as anchor separator
	  - The '/' and '\' are reserved to distinguish file links & urls
	  - First character of each part MUST be alphanumeric
		(including utf8 letters / numbers)

	For file system filenames we can not use:
	'\', '/', ':', '*', '?', '"', '<', '>', '|'
	(checked both win32 & posix)

	Do not allow '\n' and '\t' for obvious reasons

	Allowing '%' will cause problems with sql wildcards sooner
	or later - also for url decoding ambiguity it is better to
	keep this one reserved.

	All other characters are allowed in page names

	Note that Zim version < 0.42 used different rules that are not
	fully compatible, this is important when upgrading old notebooks.
	See L{Notebook.cleanup_pathname_zim028()}
	'''

	__slots__ = ('name',)

	@staticmethod
	def assertValidPageName(name):
		'''Raises an C{AssertionError} if C{name} does not represent
		a valid page name.
		This is a strict check, most names that fail this test can still
		be cleaned up by the L{makeValidPageName()}.
		@param name: a string
		@raises AssertionError: if the name is not valid
		'''
		# Raise explicitly instead of using an "assert" statement, such
		# that the check is not stripped when python runs with "-O"
		if not isinstance(name, str):
			raise AssertionError('Not a valid page name: %r' % (name,))

		if not name.strip(':') \
		or _pagename_reduce_colon_re.search(name) \
		or _pagename_invalid_char_re.search(name):
			raise AssertionError('Not a valid page name: %s' % name)

	@staticmethod
	def makeValidPageName(name):
		'''Remove any invalid characters from the string and return
		a valid page name. The only string that can not be turned into
		something valid is a string that reduces to an empty string
		after removing all invalid characters.
		@param name: a string
		@returns: a string
		@raises ValueError: when the result would be an empty string
		'''
		newname = _pagename_reduce_colon_re.sub(':', name.strip(':'))
		newname = _pagename_invalid_char_re.sub('', newname)
		newname = newname.replace('_', ' ')
		try:
			Path.assertValidPageName(newname)
		except AssertionError:
			# The AssertionError is an implementation detail, do not
			# show it as context of the error we raise to the caller
			raise ValueError('Not a valid page name: %s (was: %s)' % (newname, name)) from None
		return newname

	def __init__(self, name):
		'''Constructor.

		@param name: the absolute page name in the right case as a
		string or as a tuple strings

		The name ":" is used as a special case to construct a path for
		the toplevel namespace in a notebook.

		@note: This constructor does not do any checks for the sanity of
		the path name. Never construct a path directly from user input,
		but use either L{index.lookup_from_user_input()} or first check the
		name with L{makeValidPageName()}
		'''
		if isinstance(name, (list, tuple)):
			name = ':'.join(name)
		else:
			name = name.strip(':')

		# Both branches above result in a string, so this conversion
		# can not fail; it only normalizes string sub-classes
		self.name = str(name)

	@classmethod
	def new_from_zim_config(klass, string):
		'''Returns a new object based on the string representation for
		that path.
		@param string: the page name as a string, it is cleaned up
		with L{makeValidPageName()}
		@raises ValueError: when C{string} can not be turned into a
		valid page name
		'''
		return klass(klass.makeValidPageName(string))

	def serialize_zim_config(self):
		'''Returns the name for serializing this path'''
		return self.name

	def __repr__(self):
		return '<%s: %s>' % (self.__class__.__name__, self.name)

	def __str__(self):
		return self.name

	def __hash__(self):
		# Consistent with __eq__: paths with equal names hash the same
		return self.name.__hash__()

	def __eq__(self, other):
		'''Paths are equal when their names are the same'''
		if isinstance(other, Path):
			return self.name == other.name
		else: # e.g. path == None
			return False

	def __ne__(self, other):
		'''Paths are not equal when their names are not the same'''
		return not self.__eq__(other)

	def __add__(self, name):
		'''C{path + name} is an alias for C{path.child(name)}'''
		return self.child(name)

	@property
	def parts(self):
		'''Get all the parts of the name (split on ":")'''
		return self.name.split(':')

	@property
	def basename(self):
		'''Get the basename of the path (last part of the name)'''
		# rfind() gives -1 when there is no ":", so we start at 0 then
		i = self.name.rfind(':') + 1
		return self.name[i:]

	@property
	def namespace(self):
		'''Gives the name for the parent page.
		Returns an empty string for the top level namespace.
		'''
		i = self.name.rfind(':')
		if i > 0:
			return self.name[:i]
		else:
			return ''

	@property
	def isroot(self):
		'''C{True} when this Path represents the top level namespace'''
		return self.name == ''

	def relname(self, path): # TODO make this use HRef !
		'''Get a part of this path relative to a parent path

		@param path: a parent L{Path}
		@returns: the part of the path that is relative to C{path}
		@raises ValueError: if C{path} is not a parent of this path
		'''
		if path.name == '': # root path
			return self.name
		elif self.name.startswith(path.name + ':'):
			i = len(path.name) + 1
			return self.name[i:].strip(':')
		else:
			raise ValueError('"%s" is not below "%s"' % (self, path))

	@property
	def parent(self):
		'''Get the path for the parent page

		Gives the root path for top level pages and C{None} for the
		root path itself.
		'''
		namespace = self.namespace
		if namespace:
			return Path(namespace)
		elif self.isroot:
			return None
		else:
			return Path(':')

	def parents(self):
		'''Generator function for parent Paths including root

		Yields the direct parent first and the root path last.
		'''
		parts = self.name.split(':')
		for i in range(len(parts) - 1, 0, -1):
			yield Path(':'.join(parts[:i]))
		yield Path(':')

	def child(self, basename):
		'''Get a child Path

		@param basename: the relative name for the child
		@returns: a new L{Path} object
		'''
		return Path(self.name + ':' + basename)

	def ischild(self, parent):
		'''Check whether this path is a child of a given path
		@param parent: a L{Path} object
		@returns: True when this path is a (grand-)child of C{parent}
		'''
		return parent.isroot or self.name.startswith(parent.name + ':')

	def match_namespace(self, namespace):
		'''Check whether this path is in a specific section of the notebook
		@param namespace: a L{Path} object
		@returns: True when this path is equal to C{namespace} or is a (grand-)child of C{namespace}
		'''
		return namespace.isroot or self.name == namespace.name or self.name.startswith(namespace.name + ':')

	def commonparent(self, other):
		'''Find a common parent for two Paths

		@param other: another L{Path} object

		@returns: a L{Path} object for the first common parent, this
		is the root path when the paths have no part in common
		'''
		common = []
		for mine, theirs in zip(self.parts, other.parts):
			if mine != theirs:
				break
			common.append(mine)
		return Path(':'.join(common)) # empty name results in the root path
class HRef(object):
	'''Class representing a link to a page as written in the wiki
	syntax, before it is resolved to a L{Path}.

	@ivar rel: how the link is to be resolved, one of
	C{HREF_REL_ABSOLUTE}, C{HREF_REL_FLOATING} or C{HREF_REL_RELATIVE}
	@ivar names: the page name part of the link as string, can be
	empty (e.g. for a link that only has an anchor)
	@ivar anchor: the part of the link after the "#" or C{None}
	'''

	__slots__ = ('rel', 'names', 'anchor')

	@classmethod
	def new_from_wiki_link(klass, href):
		'''Constructor that constructs a L{HRef} object for a link as
		written in zim's wiki syntax.
		@param href: a string for the link
		@returns: a L{HRef} object
		@raises ValueError: when the string could not be parsed
		(see L{Path.makeValidPageName()})

		@note: This method HRef class assumes the logic of our wiki links
		for other formats, a separate constructor may be needed
		'''
		# The prefix determines the link type, this mirrors the
		# prefixes written by to_wiki_link()
		if href.startswith(':'):
			rel = HREF_REL_ABSOLUTE
		elif href.startswith('+'):
			rel = HREF_REL_RELATIVE
		else:
			rel = HREF_REL_FLOATING

		anchor = None
		if '#' in href:
			href, anchor = href.split('#', 1)

		# An empty page name is allowed for links with only an anchor
		names = Path.makeValidPageName(href.lstrip('+')) if href else ""

		return klass(rel, names, anchor)

	def __init__(self, rel, names, anchor=None):
		'''Constructor
		@param rel: the link type, see C{HREF_REL_*} constants
		@param names: the page name part of the link as string
		@param anchor: the anchor as string or C{None}
		'''
		self.rel = rel
		self.names = names
		self.anchor = anchor

	def __str__(self):
		rel = {HREF_REL_ABSOLUTE: 'abs', HREF_REL_FLOATING: 'float', HREF_REL_RELATIVE: 'rel'}[self.rel]
		return '<%s: %s %s %s>' % (self.__class__.__name__, rel, self.names, self.anchor)

	def parts(self):
		'''Returns the page name split on ":" as a list, the list is
		empty when there is no page name
		'''
		return self.names.split(':') if self.names else []

	def to_wiki_link(self):
		'''Returns href as text for wiki link'''
		if self.rel == HREF_REL_ABSOLUTE:
			link = ":" + self.names.strip(':')
		elif self.rel == HREF_REL_RELATIVE:
			link = "+" + self.names
		else:
			link = self.names
		if self.anchor:
			link += "#" + self.anchor
		return link
class SourceFile(zim.fs.File):
	'''File object that refuses to be written to. It always reports
	itself as not writable and raises on any attempt to write.
	'''

	def iswritable(self):
		'''Always returns C{False}'''
		return False

	def write(self, *a):
		'''Not supported for this class
		@raises AssertionError: always
		'''
		raise AssertionError('Not writeable')

	def writelines(self, *a):
		'''Not supported for this class
		@raises AssertionError: always
		'''
		raise AssertionError('Not writeable')
class PageReadOnlyError(Error):
	'''Error raised by L{Page.set_parsetree()} when trying to modify
	a page that is read-only.
	'''

	_msg = _('Can not modify page: %s') # T: error message for read-only pages
class Page(Path, SignalEmitter):
	'''Class to represent a single page in the notebook.

	Page objects inherit from L{Path} but have internal state reflecting
	content in the notebook. We try to keep Page objects unique
	by hashing them in L{Notebook.get_page()}, Path object on the other
	hand are cheap and can have multiple instances for the same logical path.
	We ask for a path object instead of a name in the constructor to
	encourage the use of Path objects over passing around page names as
	string.

	You can use a Page object instead of a Path anywhere in the APIs where
	a path is needed as argument etc.

	@ivar name: full page name (inherited from L{Path})
	@ivar hascontent: C{True} if the page has content
	@ivar haschildren: C{True} if the page has sub-pages
	@ivar modified: C{True} if the page was modified since the last
	store. Will be reset by L{Notebook.store_page()}
	@ivar readonly: C{True} when the page is read-only or belongs to a readonly notebook

	@signal: C{storage-changed (changed-on-disk)}: signal emitted on page
	change. The argument "changed-on-disk" is C{True} when an external
	edit was detected. For internal edits it is C{False}.
	@signal: C{modified-changed ()}: emitted when the page is edited
	'''

	__signals__ = {
		'storage-changed': (SIGNAL_NORMAL, None, (bool,)),
		'modified-changed': (SIGNAL_NORMAL, None, ()),
	}

	def __init__(self, path, haschildren, file, folder, format):
		'''Constructor
		@param path: a L{Path} object, only the name is used
		@param haschildren: C{True} if the page has sub-pages
		@param file: file object for the page source
		@param folder: folder object for the attachments of the page
		@param format: either a format module or a string
		that is understood by L{zim.formats.get_format()}
		'''
		assert isinstance(path, Path)
		self.name = path.name
		self.haschildren = haschildren
			# Note: this attribute is updated by the owning notebook
			# when a child page is stored
		self._modified = False
		self._change_counter = 0
		self._parsetree = None
		self._textbuffer = None
		self._meta = None

		self._readonly = None # determined on first use, see "readonly"
		self._last_etag = None
		if isinstance(format, str):
			self.format = zim.formats.get_format(format)
		else:
			self.format = format
		self.source = SourceFile(file.path) # XXX
		self.source_file = file
		self.attachments_folder = folder

	@property
	def readonly(self):
		'''C{True} when the source file is not writable, the value is
		cached after the first check
		'''
		if self._readonly is None:
			self._readonly = not self.source_file.iswritable()
		return self._readonly

	@property
	def mtime(self):
		'''The mtime of the source file, or C{None} if it does not exist'''
		return self.source_file.mtime() if self.source_file.exists() else None

	@property
	def ctime(self):
		'''The ctime of the source file, or C{None} if it does not exist'''
		return self.source_file.ctime() if self.source_file.exists() else None

	@property
	def hascontent(self):
		'''Returns whether this page has content'''
		# Content in memory takes precedence over the state on disk
		if self._textbuffer:
			return self._textbuffer.hascontent
		elif self._parsetree:
			return self._parsetree.hascontent
		else:
			return self.source_file.exists()

	@property
	def modified(self):
		'''C{False} when the page is not modified, else the (non-zero)
		value of the change counter, see L{set_modified()}
		'''
		return self._modified

	def set_modified(self, modified):
		'''Set the modified state and emit the C{modified-changed} signal
		@param modified: C{True} when the page was modified, C{False}
		to reset the state
		'''
		if modified:
			# HACK: by setting page.modified to a number rather than a
			# bool we can use this number to check against race conditions
			# in notebook.store_page_async post handler
			self._change_counter = max(1, (self._change_counter + 1) % 1000)
			self._modified = self._change_counter
			assert bool(self._modified) is True, 'BUG in counter'
		else:
			self._modified = False
		self.emit('modified-changed')

	def on_buffer_modified_changed(self, buffer):
		# one-way traffic, set page modified after modifying the buffer
		# but do not set page.modified False again when buffer goes
		# back to un-modified. Reason is that we use the buffer modified
		# state to track if we already requested the parse tree (see
		# get_parsetree()) while page modified is used to track need
		# for saving and is reset after save was done
		if buffer.get_modified():
			if self.readonly:
				logger.warning('Buffer edited while page read-only - potential bug')
			self.set_modified(True)

	def _store(self):
		# Write the current content of the page to the source file
		tree = self.get_parsetree()
		self._store_tree(tree)

	def _store_tree(self, tree):
		# Write "tree" to the source file, or remove the source file
		# when there is no content. Always emits "storage-changed".
		if tree and tree.hascontent:
			if self._meta is None and self.source_file.exists():
				# Headers not yet loaded, try getting them from file.
				# Only the headers are taken from the file, the content
				# to be stored must remain the tree we were given.
				try:
					text = self.source_file.read()
				except zim.newfs.FileNotFoundError:
					# File was removed since the check above, handle
					# this as a new file below
					pass
				else:
					# Parse in the same way as get_parsetree() does
					parser = self.format.Parser()
					self._meta = parser.parse(text, file_input=True).meta

			if self._meta is not None:
				tree.meta.update(self._meta) # Preserve headers
			else: # no existing source file
				now = datetime.now()
				tree.meta['Creation-Date'] = now.isoformat()

			lines = self.format.Dumper().dump(tree, file_output=True)
			self._last_etag = self.source_file.writelines_with_etag(lines, self._last_etag)
			self._meta = tree.meta
		else:
			self.source_file.remove()
			self._last_etag = None
			self._meta = None
		self.emit('storage-changed', False)

	def check_source_changed(self):
		'''Checks for changes in the source file and load it if needed

		If the page has a C{textbuffer} and it contains unsaved changes, this
		method will not overwrite them and you'll get an error on next attempt
		to save. To force overwrite see L{reload_textbuffer()}

		@returns: C{True} when a change was detected, C{False} otherwise
		'''
		if self._last_etag:
			# We have seen the file before, it changed when it was
			# removed or when the etag no longer matches
			changed = not (
				self.source_file.exists()
				and self.source_file.verify_etag(self._last_etag)
			)
		else:
			# We did not see a file before, so any file is new
			changed = self.source_file.exists()

		if not changed:
			return False

		logger.info('Page changed on disk: %s', self.name)
		self._last_etag = None
		self._meta = None
		if self._textbuffer and not self._textbuffer.get_modified():
			self.reload_textbuffer()
		else:
			self._parsetree = None

		self.emit('storage-changed', True)
		return True

	def exists(self):
		'''C{True} when the page has either content or children'''
		return self.haschildren or self.hascontent

	def isequal(self, other):
		'''Check equality of pages
		This method is intended to deal with case-insensitive storage
		backends (e.g. case insensitive file system) where the method
		is supposed to check equality of the resource.
		Note that this may be the case even when the page objects differ
		and can have a different name (so L{__eq__} will not show
		them to be equal). However default falls back to L{__eq__}.
		@returns: C{True} if both page objects point to the same resource
		@implementation: can be implemented by subclasses
		'''
		if self is other or self == other:
			return True
		elif self.source_file.exists():
			return self.source_file.isequal(other.source_file)
		else:
			return False

	def get_parsetree(self):
		'''Returns the contents of the page

		The content is taken from the textbuffer when one is set, else
		from the parsetree in memory and else it is read from the
		source file.

		@returns: a L{zim.formats.ParseTree} object or C{None}
		'''
		if self._textbuffer:
			if self._textbuffer.get_modified() or self._parsetree is None:
				self._parsetree = self._textbuffer.get_parsetree()
				# The buffer modified state tracks whether we need to
				# get the tree again, see on_buffer_modified_changed()
				self._textbuffer.set_modified(False)
			return self._parsetree
		elif self._parsetree:
			return self._parsetree
		else:
			try:
				text, self._last_etag = self.source_file.read_with_etag()
			except zim.newfs.FileNotFoundError:
				return None
			else:
				parser = self.format.Parser()
				self._parsetree = parser.parse(text, file_input=True)
				self._meta = self._parsetree.meta
				assert self._meta is not None
				return self._parsetree

	def set_parsetree(self, tree):
		'''Set the parsetree with content for this page

		@param tree: a L{zim.formats.ParseTree} object with content
		or C{None} to remove all content from the page

		@raises PageReadOnlyError: when the page is read-only

		@note: after setting new content in the Page object it still
		needs to be stored in the notebook to save this content
		permanently. See L{Notebook.store_page()}.
		'''
		if self.readonly:
			raise PageReadOnlyError(self)
		self._set_parsetree(tree)

	def _set_parsetree(self, tree):
		# Set the parsetree without checking the readonly state
		self._parsetree = tree
		if self._textbuffer:
			assert not self._textbuffer.get_modified(), 'BUG: changing parsetree while buffer was changed as well'
			try:
				if tree is None:
					self._textbuffer.clear()
				else:
					self._textbuffer.set_parsetree(tree)
			finally:
				# Prevent auto-save to kick in at any cost, also
				# when updating the buffer failed
				self._textbuffer.set_modified(False)

		self.set_modified(True)

	def append_parsetree(self, tree):
		'''Append content

		@param tree: a L{zim.formats.ParseTree} object with content
		'''
		if self._textbuffer:
			self._textbuffer.append_parsetree(tree)
		else:
			ourtree = self.get_parsetree()
			if ourtree:
				self.set_parsetree(ourtree + tree)
			else:
				self.set_parsetree(tree)

	def get_textbuffer(self, constructor=None):
		'''Get a C{Gtk.TextBuffer} for the page

		Will either return an existing buffer or construct a new one and return
		it. A C{Gtk.TextBuffer} can be shared between multiple C{Gtk.TextView}s.
		The page object owns the textbuffer to allow multiple views on the same
		page.

		Once a buffer is set, also methods like L{get_parsetree()} and
		L{set_parsetree()} will interact with this buffer.

		@param constructor: if no buffer was set previously, this function
		is called to construct the buffer. It is called with the current
		parsetree as keyword argument C{parsetree}.

		@returns: a C{TextBuffer} object or C{None} if no buffer is set and
		no constructor is provided.
		'''
		if self._textbuffer is None:
			if constructor is None:
				return None

			tree = self.get_parsetree()
			self._textbuffer = constructor(parsetree=tree)
			self._textbuffer.connect('modified-changed', self.on_buffer_modified_changed)

		return self._textbuffer

	def reload_textbuffer(self):
		'''Reload page content from source file and update the textbuffer if set

			NOTE: this method overwrites any changes in the C{textbuffer} or
			C{parsetree} that have not been saved to file !
		'''
		buffer = self._textbuffer
		# Unset the buffer first to force get_parsetree() to read the file
		self._textbuffer = None
		self._parsetree = None
		if buffer is not None:
			tree = self.get_parsetree()
			self._textbuffer = buffer
			buffer.set_modified(False)
			self._set_parsetree(tree)
				# load new tree in buffer, undo-able in 1 step
				# private method circumvents readonly check !
			self.set_modified(False)
		# else do nothing - source will be read with next call to `get_parsetree()`

	def dump(self, format, linker=None):
		'''Get content in a specific format

		Convenience method that converts the current parse tree to a
		particular format first.

		@param format: either a format module or a string
		that is understood by L{zim.formats.get_format()}.

		@param linker: a linker object (see e.g. L{BaseLinker})

		@returns: text as a list of lines or an empty list
		'''
		if isinstance(format, str):
			format = zim.formats.get_format(format)

		if linker is not None:
			linker.set_path(self)

		tree = self.get_parsetree()
		if tree:
			return format.Dumper(linker=linker).dump(tree)
		else:
			return []

	def parse(self, format, text, append=False):
		'''Store formatted text in the page

		Convenience method that parses text and sets the parse tree
		accordingly.

		@param format: either a format module or a string
		that is understood by L{zim.formats.get_format()}.
		@param text: text as a string or as a list of lines
		@param append: if C{True} the text is appended instead of
		replacing current content.
		'''
		if isinstance(format, str):
			format = zim.formats.get_format(format)

		tree = format.Parser().parse(text)
		if append:
			self.append_parsetree(tree)
		else:
			self.set_parsetree(tree)

	def get_links(self):
		'''Generator for links in the page content

		This method gives the raw links from the content, if you want
		nice L{Link} objects use
		L{index.list_links()<zim.index.Index.list_links()>} instead.

		@returns: yields a list of 3-tuples C{(type, href, attrib)}
		where:
		  - C{type} is the link type (e.g. "page" or "file")
		  - C{href} is the link itself
		  - C{attrib} is a dict with link properties
		'''
		# FIXME optimize with a ParseTree.get_links that does not
		#       use Node
		tree = self.get_parsetree()
		if tree:
			for elt in tree.findall(zim.formats.LINK):
				href = elt.attrib.pop('href')
				yield link_type(href), href, elt.attrib

			for elt in tree.findall(zim.formats.IMAGE):
				# Only images that link to something have a "href"
				if 'href' not in elt.attrib:
					continue
				href = elt.attrib.pop('href')
				yield link_type(href), href, elt.attrib

	def get_tags(self):
		'''Generator for tags in the page content

		@returns: yields an unordered list of unique 2-tuples
		C{(name, attrib)} for tags in the parsetree.
		'''
		# FIXME optimize with a ParseTree.get_links that does not
		#       use Node
		tree = self.get_parsetree()
		if tree:
			seen = set()
			for elt in tree.findall(zim.formats.TAG):
				name = elt.gettext()
				if name not in seen:
					seen.add(name)
					yield name.lstrip('@'), elt.attrib

	def get_anchors(self):
		'''Generator returning all the (explicit) anchors in the page content

		@returns: yields unique 2-tuples C{(name, attrib)} for
		anchors in the parsetree.
		'''
		tree = self.get_parsetree()
		if tree:
			seen = set()
			for elt in tree.findall(zim.formats.ANCHOR):
				name = elt.gettext()
				if name not in seen:
					seen.add(name)
					yield name, elt.attrib

	def get_title(self):
		'''Get the title of the page
		@returns: the heading text of the page content, or the
		basename of the page when there is no heading
		'''
		tree = self.get_parsetree()
		if tree:
			return tree.get_heading_text() or self.basename
		else:
			return self.basename

	def heading_matches_pagename(self):
		'''Returns whether the heading matches the page name.
		Used to determine whether the page should have its heading
		auto-changed on rename/move.
		@returns: C{True} when the heading can be auto-changed.
		'''
		tree = self.get_parsetree()
		if tree:
			return tree.get_heading_text() == self.basename
		else:
			return False