1# SPDX-FileCopyrightText: 2021 GNOME Foundation
2# SPDX-License-Identifier: Apache-2.0 OR GPL-3.0-or-later
3
4import markdown
5import os
6import re
7import subprocess
8import sys
9
10from markupsafe import Markup
11from pygments import highlight
12from pygments.lexers import get_lexer_by_name
13from pygments.formatters import HtmlFormatter
14from typogrify.filters import typogrify
15
16from . import gir, log, mdext, porter
17
18
# Regular expressions for the legacy gtk-doc code block syntax. Each one is
# matched against a single line of documentation and tolerates leading and
# trailing whitespace.

# The beginning of a gtk-doc code block:
#
# |[ (optional language identifier)
#
# The "language" group captures the whole `<!-- language="..." -->` comment,
# not just the language name; LANGUAGE_RE extracts the name from it.
#
CODEBLOCK_START_RE = re.compile(
    r'''
    ^
    \s*
    \|\[
    \s*
    (?P<language>\<\!-- \s* language="\w+" \s* --\>)?
    \s*
    $
    ''',
    re.UNICODE | re.VERBOSE)

# The optional language identifier for a gtk-doc code block:
#
# <!-- language="..." -->
#
# The "language" group captures only the name between the quotes.
#
LANGUAGE_RE = re.compile(
    r'''
    ^
    \s*
    <!--
    \s*
    language="(?P<language>\w+)"
    \s*
    -->
    \s*
    $
    ''',
    re.UNICODE | re.VERBOSE)

# The ending of a gtk-doc code block:
#
# ]|
#
CODEBLOCK_END_RE = re.compile(
    r'''
    ^
    \s*
    \]\|
    \s*
    $
    ''',
    re.UNICODE | re.VERBOSE)
66
# A gi-docgen link, optionally preceded by a bracketed link text:
#
# [text][fragment@endpoint]
#
# The "text" group, when present, includes the surrounding square brackets.
# The fragment and endpoint may be wrapped in a pair of backticks inside the
# brackets. The pattern is compiled in verbose mode, so the literal spaces in
# it are not part of the match.
#
LINK_RE = re.compile(
    r'''
    (?P<text>\[ [\w\s,\-_:]+ \])?
    \[
    (`)?
    (?P<fragment>[\w]+)
    @
    (?P<endpoint>[\w\-_:\.]+)
    (`)?
    \]
    ''',
    re.VERBOSE)

# The patterns below parse the endpoint of a link. In all of them the "ns"
# group, when present, includes the trailing dot. None of them is anchored at
# the end, so trailing characters after the matched part are not rejected.

# The endpoint of a type link: Namespace.TypeName, or just TypeName
TYPE_RE = re.compile(
    r'''
    (?P<ns>[\w]+\.)?            # namespace (optional)
    (?P<name>[\w]+)             # type name
    ''',
    re.VERBOSE)

# The endpoint of a property link: Namespace.TypeName:property-name
PROPERTY_RE = re.compile(
    r'''
    (?P<ns>[\w]+\.)?            # namespace (optional)
    (?P<name>[\w]+)             # type name
    :{1}                        # delimiter
    (?P<property>[\w-]*\w)      # property name
    ''',
    re.VERBOSE)

# The endpoint of a signal link: Namespace.TypeName::signal-name
SIGNAL_RE = re.compile(
    r'''
    (?P<ns>[\w]+\.)?            # namespace (optional)
    (?P<name>[\w]+)             # type name
    :{2}                        # delimiter
    (?P<signal>[\w-]*\w)        # signal name
    ''',
    re.VERBOSE)

# The endpoint of a constructor, method, or virtual function link:
# Namespace.TypeName.method_name
METHOD_RE = re.compile(
    r'''
    (?P<ns>[\w]+\.)?            # namespace (optional)
    (?P<name>[\w]+)             # type name
    \.                          # delimiter
    (?P<method>[\w_]*\w)        # method name
    ''',
    re.VERBOSE)
113
# Word boundaries inside CamelCase identifiers; index_identifier() inserts an
# underscore between the two groups of each match.

# A run of capitals followed by a capitalized word, e.g. "HTMLParser"
CAMEL_CASE_START_RE = re.compile(r"([A-Z]+)([A-Z][a-z])")

# A lower case letter or digit followed by a capital, e.g. "fooBar"
CAMEL_CASE_CHUNK_RE = re.compile(r"([a-z\d])([A-Z])")
117
# Languages accepted in the language annotation of a gtk-doc code block,
# keyed by their lower case name; every entry currently maps to itself.
LANGUAGE_MAP = {
    name: name
    for name in ('c', 'css', 'plain', 'xml', 'javascript')
}
125
# Markdown extensions used by preprocess_docs() when it builds its own
# converter: the standard Python-Markdown ones first...
MD_EXTENSIONS = [
    'codehilite',
    'def_list',
    'fenced_code',
    'meta',
    'tables',
    'toc',
]
# ...followed by the local gtk-doc extension
MD_EXTENSIONS.append(mdext.GtkDocExtension())

# Per-extension configuration, keyed by extension name
MD_EXTENSIONS_CONF = dict(
    codehilite=dict(guess_lang=False),
    toc=dict(permalink_class='md-anchor', permalink=''),
)
143
# English words that are skipped when building the sets of search terms
EN_STOPWORDS = {
    "a", "and", "are", "as", "at",
    "be", "but", "by",
    "for",
    "if", "in", "into", "is", "it",
    "near", "no", "not",
    "of", "on", "or",
    "such",
    "that", "the", "their", "then", "there", "these", "they", "this", "to",
    "was", "will", "with",
}
155
156
def process_language(lang):
    """Return the highlighting language for a gtk-doc code block.

    Args:
        lang: The language annotation captured by CODEBLOCK_START_RE, i.e.
            the whole `<!-- language="..." -->` comment, or None if the code
            block has no annotation.

    Returns:
        The matching value of LANGUAGE_MAP, looked up case-insensitively, or
        "plain" when the annotation is missing, malformed, or names a
        language that is not in LANGUAGE_MAP.
    """
    if lang is None:
        return "plain"

    res = LANGUAGE_RE.match(lang)
    language = res.group("language") if res else "plain"

    # An unknown language must not abort the documentation build with a
    # KeyError; render the block without highlighting instead
    return LANGUAGE_MAP.get(language.lower(), "plain")
168
169
170class LinkParseError:
171    def __init__(self, line=None, start=0, end=0, fragment=None, rest=None, message="Unable to parse link"):
172        self.line = line
173        self.start = start
174        self.end = end
175        self.fragment = fragment
176        self.rest = rest
177        self.message = message
178
179    def __str__(self):
180        if self.line is not None:
181            msg = [self.message]
182            msg.append(self.line)
183            err_line = ['^'.rjust(self.start + 1, ' ')]
184            err_line += [''.join(['~' for x in range(self.end - self.start - 1)])]
185            msg.append("".join(err_line))
186            return "\n".join(msg)
187        else:
188            return f"{self.message}: [{self.fragment}@{self.rest}]"
189
190
class LinkGenerator:
    """Resolve one gi-docgen link and render it as text or as an HTML anchor.

    A link has the form `[fragment@endpoint]`: the fragment (`class`,
    `method`, `signal`, ...) selects the parser used to look the endpoint up
    in the current namespace, or in one of the namespaces included by its
    repository. The outcome is available through the `text` and `href`
    properties, or as a string through `str()`.

    A link that cannot be resolved is reported with `log.warning()` and is
    rendered without an anchor.
    """

    # Fragments that link to the page of a type
    # NOTE(review): 'callback' is accepted by the parser but is not listed
    # here, so callback links render as the bare endpoint with no anchor;
    # confirm whether that is intended
    _TYPE_FRAGMENTS = ('alias', 'class', 'const', 'enum', 'error', 'flags', 'iface', 'struct')

    def __init__(self, **kwargs):
        """Parse a link.

        Keyword Args:
            line: The line containing the link; only used in warnings.
            start: Offset of the link inside `line`; only used in warnings.
            end: Offset of the end of the link; only used in warnings.
            namespace: The namespace being documented. Required.
            fragment: The part of the link before the `@`.
            endpoint: The part of the link after the `@`.
            no_link: If true, `str()` yields the link text without an anchor.
            text: Alternative link text, including its surrounding brackets.
        """
        self._line = kwargs.get('line')
        self._start = kwargs.get('start', 0)
        self._end = kwargs.get('end', 0)
        self._namespace = kwargs.get('namespace')
        self._fragment = kwargs.get('fragment', '')
        self._endpoint = kwargs.get('endpoint', '')
        self._no_link = kwargs.get('no_link', False)
        self._alt_text = kwargs.get('text')

        assert self._namespace is not None

        self._repository = self._namespace.repository
        self._valid_namespaces = list(self._repository.includes)
        self._external = False

        fragment_parsers = {
            "alias": self._parse_type,
            "callback": self._parse_type,
            "class": self._parse_type,
            "const": self._parse_type,
            "ctor": self._parse_method,
            "enum": self._parse_type,
            "error": self._parse_type,
            "flags": self._parse_type,
            "func": self._parse_func,
            "id": self._parse_id,
            "iface": self._parse_type,
            "method": self._parse_method,
            "property": self._parse_property,
            "signal": self._parse_signal,
            "struct": self._parse_type,
            "type": self._parse_type,
            "vfunc": self._parse_method,
        }

        parser_method = fragment_parsers.get(self._fragment)
        if parser_method is None:
            # Build the error before clearing the fragment, so that the
            # message can still mention it
            res = self._error("Unable to parse link")
        else:
            res = parser_method(self._fragment)

        if res is not None:
            # A cleared fragment makes the link render as plain text
            self._fragment = None
            log.warning(str(res))

    def _error(self, message):
        """Build a LinkParseError for this link with the given message."""
        return LinkParseError(self._line, self._start, self._end,
                              self._fragment, self._endpoint,
                              message)

    def _strip_identifier_prefix(self, name):
        """Return `name` without the leading identifier prefix of the namespace.

        This lets `FooBar` be used in place of `Foo.Bar`. Only a prefix at the
        very start of the name is removed; later occurrences of the same
        characters belong to the type name and are kept.
        """
        for prefix in self._namespace.identifier_prefix:
            if prefix and name.startswith(prefix):
                return name[len(prefix):]
        return name

    def _qualify(self, ns, name):
        """Return the (namespace name, type name) pair for a parsed endpoint.

        `ns` is the optional namespace group of the endpoint, trailing dot
        included; without it the type is looked up in the current namespace.
        """
        if ns is not None:
            return ns[:-1], name    # Drop the trailing dot
        return self._namespace.name, self._strip_identifier_prefix(name)

    def _resolve_namespace(self, ns):
        """Return the namespace called `ns`, or None if it is not available.

        On success this also records the namespace name, and whether it is
        external to the one being documented.
        """
        if ns == self._namespace.name:
            self._external = False
            self._ns = ns
            return self._namespace

        namespace = self._namespace.repository.find_included_namespace(ns)
        if namespace is not None:
            self._external = True
            self._ns = namespace.name
        return namespace

    def _parse_id(self, fragment):
        """Resolve a link whose endpoint is a C symbol name.

        The symbol is looked up in the repository, and the link is turned
        into a 'method' or 'func' link depending on what it belongs to.
        Returns None on success, and a LinkParseError otherwise.
        """
        symbol = self._repository.find_symbol(self._endpoint)
        if symbol is None:
            return self._error(f"Unable to find symbol {self._endpoint}")

        (ns, t) = symbol
        if isinstance(t, (gir.Class, gir.Interface, gir.Record)):
            self._external = ns is not self._namespace
            self._ns = ns.name
            self._fragment = 'method'
            self._symbol_name = f"{self._endpoint}()"
            self._name = t.name
            # Drop the namespace and type prefixes from the C symbol
            self._method_name = self._endpoint.replace(ns.symbol_prefix[0] + '_', '')
            self._method_name = self._method_name.replace(t.symbol_prefix + '_', '')
            return None

        if isinstance(t, gir.Function):
            self._external = ns is not self._namespace
            self._ns = ns.name
            self._fragment = 'func'
            self._symbol_name = f"{self._endpoint}()"
            self._name = None
            self._func_name = self._endpoint.replace(ns.symbol_prefix[0] + '_', '')
            return None

        return self._error(f"Unsupported symbol {self._endpoint}")

    def _parse_type(self, fragment):
        """Resolve a link to a type.

        For the generic 'type' fragment, the fragment is replaced with the
        specific one matching the type that was found. Returns None on
        success or when the namespace is unknown, and a LinkParseError
        otherwise.
        """
        res = TYPE_RE.match(self._endpoint)
        if res is None:
            return self._error("Invalid type link")

        ns, name = self._qualify(res.group('ns'), res.group('name'))
        namespace = self._resolve_namespace(ns)
        if namespace is None:
            self._fragment = None
            return None

        t = namespace.find_real_type(name)
        if t is None or t.base_ctype is None:
            return self._error(f"Unable to find type '{ns}.{name}'")

        if fragment == 'type':
            # Checked in order; the first matching class wins
            type_fragments = (
                (gir.Alias, 'alias'),
                (gir.BitField, 'flags'),
                (gir.Callback, 'callback'),
                (gir.Class, 'class'),
                (gir.Constant, 'const'),
                (gir.Enumeration, 'enum'),
                (gir.ErrorDomain, 'error'),
                (gir.Interface, 'iface'),
                ((gir.Record, gir.Union), 'struct'),
            )
            for gir_type, type_fragment in type_fragments:
                if isinstance(t, gir_type):
                    self._fragment = type_fragment
                    break
            else:
                return self._error(f"Invalid type {t} for '{ns}.{name}'")

        self._name = name
        self._type = t.base_ctype
        return None

    def _parse_property(self, fragment):
        """Resolve a link to a property of a class or interface.

        Returns None on success or when the namespace is unknown, and a
        LinkParseError otherwise.
        """
        res = PROPERTY_RE.match(self._endpoint)
        if res is None:
            return self._error("Invalid property link")

        ns, name = self._qualify(res.group('ns'), res.group('name'))
        # Canonicalize the property name
        pname = res.group('property').replace('_', '-')

        namespace = self._resolve_namespace(ns)
        if namespace is None:
            self._fragment = None
            return None

        t = namespace.find_real_type(name)
        if t is None or t.base_ctype is None:
            return self._error(f"Unable to find type '{ns}.{name}'")
        self._type = t.base_ctype
        self._name = name

        if isinstance(t, (gir.Class, gir.Interface)) and pname in t.properties:
            self._property_name = pname
            return None
        return self._error(f"Invalid property '{pname}' for type '{ns}.{name}'")

    def _parse_signal(self, fragment):
        """Resolve a link to a signal of a class or interface.

        Returns None on success or when the namespace is unknown, and a
        LinkParseError otherwise.
        """
        res = SIGNAL_RE.match(self._endpoint)
        if res is None:
            return self._error("Invalid signal link")

        ns, name = self._qualify(res.group('ns'), res.group('name'))
        # Canonicalize the signal name
        sname = res.group('signal').replace('_', '-')

        namespace = self._resolve_namespace(ns)
        if namespace is None:
            self._fragment = None
            return None

        t = namespace.find_real_type(name)
        if t is None or t.base_ctype is None:
            return self._error(f"Unable to find type '{ns}.{name}'")
        self._type = t.base_ctype
        self._name = name

        if isinstance(t, (gir.Class, gir.Interface)) and sname in t.signals:
            self._signal_name = sname
            return None
        return self._error(f"Invalid signal name '{sname}' for type '{ns}.{name}'")

    def _parse_method(self, fragment):
        """Resolve a link to a constructor, method, or virtual function.

        Returns None on success or when the namespace is unknown, and a
        LinkParseError otherwise.
        """
        res = METHOD_RE.match(self._endpoint)
        if res is None:
            return self._error("Invalid method link")

        ns, name = self._qualify(res.group('ns'), res.group('name'))
        method = res.group('method')

        namespace = self._resolve_namespace(ns)
        if namespace is None:
            self._fragment = None
            return None

        t = namespace.find_real_type(name)
        if t is None or t.base_ctype is None:
            return self._error(f"Unable to find type '{ns}.{name}'")

        self._type = t.base_ctype
        self._method_name = method
        # method@Foo.BarClass.add_name -> class_method.Bar.add_name.html
        if isinstance(t, gir.Record) and t.struct_for is not None:
            self._name = t.struct_for
            self._fragment = "class_method"
        elif fragment == "vfunc" and t.type_struct is not None:
            self._name = name
            self._type = t.type_struct
        else:
            self._name = name

        # The list to search depends on the fragment used in the link, not
        # on the one it may have just been rewritten to
        if fragment == "ctor":
            methods = getattr(t, "constructors", [])
        elif fragment in ["method", "class_method"]:
            methods = getattr(t, "methods", [])
        elif fragment == "vfunc":
            methods = getattr(t, "virtual_methods", [])
        else:
            methods = []

        for m in methods:
            if m.name == method:
                if fragment == "vfunc":
                    self._vfunc_name = m.name
                else:
                    self._symbol_name = f"{m.identifier}()"
                return None
        return self._error(f"Unable to find method '{ns}.{name}.{method}'")

    def _parse_func(self, fragment):
        """Resolve a link to a namespace function or to a type function.

        Returns None on success or when the namespace is unknown, and a
        LinkParseError otherwise.
        """
        tokens = self._endpoint.split('.')
        if len(tokens) == 1:
            # Case 1: [func@init] => gtk_init()
            ns, name, func_name = self._namespace.name, None, tokens[0]
        elif len(tokens) == 3:
            # Case 2: [func@Gtk.Foo.bar] => gtk_foo_bar()
            ns, name, func_name = tokens
        elif len(tokens) == 2:
            # Case 3: either [func@Gtk.init] or [func@Foo.bar]
            if tokens[0] == self._namespace.name or tokens[0] in self._valid_namespaces:
                ns, name, func_name = tokens[0], None, tokens[1]
            else:
                ns, name, func_name = self._namespace.name, tokens[0], tokens[1]
        else:
            return self._error("Invalid function link")

        namespace = self._resolve_namespace(ns)
        if namespace is None:
            self._fragment = None
            log.warning(f"Namespace {ns} not found for link {self._endpoint}")
            return None

        if name is None:
            func = namespace.find_function(func_name)
            if func is None:
                return self._error(f"Unable to find function '{ns}.{func_name}'")
            self._name = None
            self._func_name = func_name
            self._symbol_name = f"{func.identifier}()"
            return None

        t = namespace.find_real_type(name)
        if t is None:
            return self._error(f"Unable to find type '{ns}.{name}'")
        # Not every type has functions; treat those as having none instead
        # of failing on the missing attribute
        for func in getattr(t, "functions", []):
            if func.name == func_name:
                self._name = name
                self._func_name = func.name
                self._symbol_name = f"{func.identifier}()"
                return None
        return self._error(f"Unable to find function '{ns}.{name}.{func_name}'")

    @property
    def text(self):
        """The text of the link: the alternative text if one was given,
        otherwise a description derived from what the link points to."""
        if self._alt_text is not None:
            # Drop the surrounding square brackets
            return self._alt_text[1:len(self._alt_text) - 1]
        elif self._fragment in self._TYPE_FRAGMENTS:
            return f"<code>{self._type}</code>"
        elif self._fragment == 'property':
            return f"<code>{self._type}:{self._property_name}</code>"
        elif self._fragment == 'signal':
            return f"<code>{self._type}::{self._signal_name}</code>"
        elif self._fragment in ['ctor', 'func', 'method', 'class_method']:
            return f"{self._symbol_name}"
        elif self._fragment == 'vfunc':
            return f"<code>{self._ns}.{self._type}.{self._vfunc_name}</code>"
        else:
            return f"{self._endpoint}"

    @property
    def href(self):
        """The file name of the page the link points to, or None if the link
        could not be resolved."""
        if self._fragment in self._TYPE_FRAGMENTS:
            return f"{self._fragment}.{self._name}.html"
        elif self._fragment == 'property':
            return f"property.{self._name}.{self._property_name}.html"
        elif self._fragment == 'signal':
            return f"signal.{self._name}.{self._signal_name}.html"
        elif self._fragment in ['ctor', 'method', 'class_method', 'vfunc']:
            return f"{self._fragment}.{self._name}.{self._method_name}.html"
        elif self._fragment == 'func':
            if self._name is not None:
                return f"type_func.{self._name}.{self._func_name}.html"
            else:
                return f"func.{self._func_name}.html"
        else:
            return None

    def __str__(self):
        """Render the link: an anchor if it was resolved and links are
        enabled, and just its text otherwise."""
        text = self.text
        if self._no_link:
            return text
        link = self.href
        if link is None:
            return text
        if self._external:
            # Links into other namespaces carry their target as data
            # attributes rather than as a real href
            data_namespace = f"data-namespace=\"{self._ns}\""
            data_link = f"data-link=\"{link}\""
            href = "href=\"javascript:void(0)\""
            css = "class=\"external\""
            return f"<a {href} {data_namespace} {data_link} {css}>{text}</a>"
        else:
            return f"<a href=\"{link}\">{text}</a>"
618
619
def _expand_links(line, namespace, no_link):
    """Return `line` with every gi-docgen link replaced by its rendering."""
    pieces = []
    idx = 0
    for m in LINK_RE.finditer(line):
        link = LinkGenerator(line=line, start=m.start(), end=m.end(),
                             namespace=namespace,
                             fragment=m.group('fragment'),
                             endpoint=m.group('endpoint'),
                             no_link=no_link, text=m.group('text'))
        pieces.append(line[idx:m.start()])
        # The rendered link is inserted verbatim; it must not be interpreted
        # as a regular expression replacement template
        pieces.append(str(link))
        idx = m.end()
    pieces.append(line[idx:])
    return "".join(pieces)


def _render_code_block(lines, language):
    """Return the lines that replace a gtk-doc code block in the Markdown source."""
    if language == "plain":
        # Leave plain blocks to Markdown, as a fenced code block
        return ["```", *lines, "```"]

    lexer = get_lexer_by_name(language)
    formatter = HtmlFormatter()
    code_block = highlight("\n".join(lines), lexer, formatter)
    # The highlighted HTML is isolated from the surrounding text by blank lines
    return ["", *code_block.split("\n"), ""]


def preprocess_docs(text, namespace, summary=False, md=None, extensions=None, plain=False, max_length=10):
    """Turn a documentation comment into HTML, or into a plain text excerpt.

    gtk-doc code blocks (`|[ ... ]|`) are converted, gi-docgen links are
    expanded, and the result is rendered as Markdown.

    Args:
        text: The documentation to process.
        namespace: The namespace used to resolve links.
        summary: If true, only the first paragraph is processed, and links
            are rendered as text without anchors.
        md: A Markdown converter to reuse; if None, a new one is set up with
            `extensions` followed by MD_EXTENSIONS.
        extensions: Additional Markdown extensions; only used if `md` is None.
        plain: If true, skip all of the above and return `text` on a single
            line, with anything that looks like a tag removed.
        max_length: In plain mode, the maximum number of words to keep,
            counting the trailing ellipsis; zero or less means no limit.

    Returns:
        A string in plain mode, or if there is nothing to render; the
        rendered HTML wrapped in Markup otherwise.
    """
    if plain:
        text = text.replace('\n', ' ')
        text = re.sub(r'<[^<]+?>', '', text)
        if max_length > 0:
            words = text.split(' ')
            if len(words) > max_length:
                words = words[:max_length - 1]
                words.append('...')
                text = ' '.join(words)
        return text

    processed_text = []

    code_block_text = []
    code_block_language = None
    inside_code_block = False

    for line in text.split("\n"):
        # If we're in "summary" mode, we bail out at the first empty line
        # after a paragraph
        if summary and line == '' and len(processed_text) > 0:
            break

        res = CODEBLOCK_START_RE.match(line)
        if res:
            code_block_language = process_language(res.group("language"))
            inside_code_block = True
            continue

        res = CODEBLOCK_END_RE.match(line)
        if res and inside_code_block:
            processed_text.extend(_render_code_block(code_block_text, code_block_language))
            code_block_language = None
            code_block_text = []
            inside_code_block = False
            continue

        if inside_code_block:
            # NOTE: the lines of a code block that is never closed are
            # collected here but never emitted
            code_block_text.append(line)
        else:
            processed_text.append(_expand_links(line, namespace, summary))

    if len(processed_text) == 0:
        return ''

    # Capitalize the first character of the first line, but only if it does not
    # start with a link or a gtk-doc marker, to avoid messing up the rest of
    # the string
    first_line = processed_text[0]
    if first_line and first_line[0].isalpha():
        processed_text[0] = ''.join([first_line[0:1].upper(), first_line[1:]])

    # Append a period, if one isn't there already
    last_line = processed_text[-1]
    if last_line and last_line[-1].isalpha():
        processed_text[-1] = ''.join([last_line, '.'])

    if md is None:
        md_ext = list(extensions) if extensions is not None else []
        md_ext.extend(MD_EXTENSIONS)
        text = markdown.markdown("\n".join(processed_text),
                                 extensions=md_ext,
                                 extension_configs=MD_EXTENSIONS_CONF)
    else:
        text = md.reset().convert("\n".join(processed_text))

    return Markup(typogrify(text, ignore_tags=['h1', 'h2', 'h3', 'h4']))
721
722
def stem(word, stemmer=None):
    """Return the stem of `word`.

    A new Porter stemmer is created for the call unless `stemmer` is given.
    """
    active = porter.PorterStemmer() if stemmer is None else stemmer
    # The stemmer takes the index of the first and of the last character
    last = len(word) - 1
    return active.stem(word, 0, last)
727
728
def index_description(text, stemmer=None):
    """Return the set of stemmed search terms found in a description.

    Code blocks, gtk-doc sigils, gi-docgen links, images, and stop words are
    left out.

    Args:
        text: The description, in the Markdown dialect used by the
            documentation.
        stemmer: The stemmer to use, passed through to stem().

    Returns:
        A set of strings.
    """
    processed_text = []

    inside_code_block = False
    for line in text.split("\n"):
        if not inside_code_block and line.startswith(('```', '|[')):
            inside_code_block = True
            continue

        if inside_code_block and line.startswith(('```', ']|')):
            inside_code_block = False
            continue

        if not inside_code_block:
            processed_text.append(line)

    terms = set()
    # Split on any run of whitespace, so that blank lines do not produce
    # empty chunks, and stray tabs or carriage returns do not end up
    # inside of a term
    for chunk in " ".join(processed_text).split():
        chunk = chunk.lower()
        # Skip gtk-doc sigils
        if chunk.startswith(('%', '#', '@')) or chunk.endswith('()'):
            continue
        # Skip gi-docgen links
        if chunk.startswith('[') and chunk.endswith(']') and '@' in chunk:
            continue
        # Skip images
        if chunk.startswith('!['):
            continue
        if chunk in EN_STOPWORDS:
            continue
        chunk = re.sub(r"`(\w+)`", r"\g<1>", chunk)
        chunk = re.sub(r"[,\.:;`]$", '', chunk)
        chunk = re.sub(r"[\(\)]+", '', chunk)
        # Removing the punctuation may leave nothing behind, or uncover a
        # stop word such as "the,"
        if not chunk or chunk in EN_STOPWORDS:
            continue
        terms.add(stem(chunk, stemmer))
    return terms
767
768
def canonicalize(symbol):
    """Return `symbol` with every dash turned into an underscore."""
    return "_".join(symbol.split('-'))
771
772
def index_identifier(symbol, stemmer=None):
    """Chunks an identifier (e.g. EventControllerClick) into terms useful for indexing."""
    # Turn the CamelCase identifier into lower case chunks separated by
    # underscores
    snake = CAMEL_CASE_START_RE.sub(r"\g<1>_\g<2>", symbol)
    snake = CAMEL_CASE_CHUNK_RE.sub(r"\g<1>_\g<2>", snake)
    chunks = snake.replace('-', '_').lower().split('_')
    return {stem(chunk, stemmer) for chunk in chunks if chunk not in EN_STOPWORDS}
785
786
def index_symbol(symbol, stemmer=None):
    """Chunks a symbol (e.g. set_layout_manager) into terms useful for indexing."""
    chunks = canonicalize(symbol).split('_')
    # Stop words are dropped, everything else is stemmed
    return {stem(chunk, stemmer) for chunk in chunks if chunk not in EN_STOPWORDS}
795
796
def code_highlight(text, language='c'):
    """Return `text` highlighted as HTML by Pygments, wrapped in Markup.

    `language` is the name of the Pygments lexer to use.
    """
    html = highlight(text, get_lexer_by_name(language), HtmlFormatter())
    return Markup(html)
801
802
def render_dot(dot, output_format="svg"):
    """Render a graph description with the Graphviz `dot` program.

    Args:
        dot: The graph, in the DOT language, as a string.
        output_format: Either "svg" or "png"; anything else is reported with
            log.error().

    Returns:
        The SVG document as a string, or None if `dot` could not be run or
        wrote anything to its standard error.
    """
    if output_format not in ["svg", "png"]:
        log.error(f"Invalid output format for render_dot(): {output_format}")

    args = ["dot", f"-T{output_format}"]

    try:
        # Feed the input through run(), which writes it and reads both
        # output pipes at the same time; writing to stdin by hand first can
        # deadlock once a pipe buffer fills up
        proc = subprocess.run(args, input=dot.encode("utf-8"),
                              stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if proc.stderr:
            log.warning(f"Unable to process dot data: {proc.stderr}")
            return None
        if output_format == "svg":
            return proc.stdout.decode("utf-8")
    except Exception as e:
        log.warning(f"Unable to process dot data: {e}")
        return None

    # NOTE(review): the rendered data is only returned for "svg"; "png" is
    # accepted above but its output is discarded -- confirm the intent
    return None
821
822
# Programs already found in the default PATH, keyed by the name they were
# looked up with
found_programs = {}


def find_program(bin_name, path=None):
    """Finds a program @bin_name inside the given @path, and returns
    its full path if found, or None if the program could not be found.

    If @path is None, the program is searched for in the PATH environment
    variable, falling back to the default search path of the platform if
    the variable is not set; otherwise @path is a list of directories
    separated by the path separator of the platform.

    The @bin_name will automatically get an extension depending on the
    platform.
    """
    if path is None and bin_name in found_programs:
        return found_programs[bin_name]

    if path is None:
        search_paths = os.environ.get('PATH', os.defpath).split(os.pathsep)
    else:
        search_paths = path.split(os.pathsep)

    bin_extensions = ['']

    if sys.platform == 'win32':
        pathext = os.environ.get('PATHEXT', '.COM;.EXE;.BAT;.CMD').lower().split(os.pathsep)
        extension = os.path.splitext(bin_name)[1]
        # Only try the executable extensions if the name doesn't already
        # have one of them
        if extension.lower() not in pathext:
            bin_extensions = pathext
        # Look into the current directory first
        search_paths.insert(0, '')

    for ext in bin_extensions:
        executable = bin_name + ext

        for p in search_paths:
            full_path = os.path.join(p, executable)
            if os.path.isfile(full_path):
                # Memoize the result with the default PATH, so we can
                # call this multiple times at no additional cost
                if path is None:
                    found_programs[bin_name] = full_path
                return full_path

    return None
865
866
def default_search_paths():
    """Return the list of directories to search by default.

    The list starts with the current directory. On Windows it is followed
    by the `share/gir-1.0` directory of the Python installation when the
    interpreter was built with GCC (MSYS2). On every other platform it is
    followed by the `gir-1.0` directory inside the XDG user data directory,
    and inside each of the XDG system data directories.
    """
    paths = [os.getcwd()]

    if sys.platform == 'win32':
        # Add sys.base_prefix when using MSYS2
        if 'GCC' in sys.version:
            paths.append(os.path.join(sys.base_prefix, 'share', 'gir-1.0'))
        return paths

    # An XDG variable that is set but empty counts as not set
    xdg_data_home = os.environ.get("XDG_DATA_HOME") or os.path.expanduser("~/.local/share")
    xdg_data_dirs = os.environ.get("XDG_DATA_DIRS") or "/usr/share:/usr/local/share"

    paths.append(os.path.join(xdg_data_home, "gir-1.0"))
    # Skip empty entries, which would turn into a relative path
    paths.extend(os.path.join(x, "gir-1.0") for x in xdg_data_dirs.split(":") if x)

    return paths
886
887
def find_extra_content_file(content_dirs, file):
    """Return the path of `file` inside the first content directory that has it.

    Raises:
        FileNotFoundError: If none of `content_dirs` contains the file.
    """
    candidates = (os.path.join(directory, file) for directory in content_dirs)
    found = next((c for c in candidates if os.path.isfile(c)), None)
    if found is None:
        raise FileNotFoundError(f"Content file {file} not found in any content directory")
    return found
895