1// Copyright 2019 The Hugo Authors. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6// http://www.apache.org/licenses/LICENSE-2.0
7//
8// Unless required by applicable law or agreed to in writing, software
9// distributed under the License is distributed on an "AS IS" BASIS,
10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11// See the License for the specific language governing permissions and
12// limitations under the License.
13
14// Package helpers implements general utility functions that work with
15// and on content.  The helper functions defined here lay down the
16// foundation of how Hugo works with files and filepaths, and perform
17// string operations on content.
18package helpers
19
20import (
21	"bytes"
22	"html/template"
23	"strings"
24	"unicode"
25	"unicode/utf8"
26
27	"github.com/gohugoio/hugo/common/hexec"
28	"github.com/gohugoio/hugo/common/loggers"
29
30	"github.com/spf13/afero"
31
32	"github.com/gohugoio/hugo/markup/converter"
33
34	"github.com/gohugoio/hugo/markup"
35
36	bp "github.com/gohugoio/hugo/bufferpool"
37	"github.com/gohugoio/hugo/config"
38)
39
// SummaryDivider denotes where content summarization should end. The default is "<!--more-->".
var SummaryDivider = []byte("<!--more-->")

// Byte patterns used by TrimShortHTML to decide whether the input is wrapped
// in a single <p> element. openingPTag and closingPTag are the exact tags that
// get stripped; paragraphIndicator and closingIndicator are prefixes only, so
// they match any tag starting with "<p" (including e.g. "<pre>") and the start
// of any closing tag, respectively.
var (
	openingPTag        = []byte("<p>")
	closingPTag        = []byte("</p>")
	paragraphIndicator = []byte("<p")
	closingIndicator   = []byte("</")
)
49
// ContentSpec provides functionality to render markdown content.
//
// It bundles the markup converters with the content-related settings that
// NewContentSpec reads from the configuration. MardownConverter is created by
// NewContentSpec from the "markdown" converter provider, and
// anchorNameSanitizer is what SanitizeAnchorName delegates to.
//
// NOTE: the exported field name MardownConverter is misspelled ("Mardown");
// it is kept as is because renaming it would break callers.
type ContentSpec struct {
	// Converters looks up markup converter providers by name.
	Converters          markup.ConverterProvider
	MardownConverter    converter.Converter // Markdown converter with no document context
	anchorNameSanitizer converter.AnchorNameSanitizer

	// summaryLength is the length of the summary that Hugo extracts from
	// content. It is read from the "summaryLength" configuration key.
	summaryLength int

	// BuildFuture, BuildExpired and BuildDrafts mirror the "buildFuture",
	// "buildExpired" and "buildDrafts" configuration keys.
	BuildFuture  bool
	BuildExpired bool
	BuildDrafts  bool

	// Cfg is the configuration provider this spec was created from.
	Cfg config.Provider
}
65
66// NewContentSpec returns a ContentSpec initialized
67// with the appropriate fields from the given config.Provider.
68func NewContentSpec(cfg config.Provider, logger loggers.Logger, contentFs afero.Fs, ex *hexec.Exec) (*ContentSpec, error) {
69	spec := &ContentSpec{
70		summaryLength: cfg.GetInt("summaryLength"),
71		BuildFuture:   cfg.GetBool("buildFuture"),
72		BuildExpired:  cfg.GetBool("buildExpired"),
73		BuildDrafts:   cfg.GetBool("buildDrafts"),
74
75		Cfg: cfg,
76	}
77
78	converterProvider, err := markup.NewConverterProvider(converter.ProviderConfig{
79		Cfg:       cfg,
80		ContentFs: contentFs,
81		Logger:    logger,
82		Exec:      ex,
83	})
84	if err != nil {
85		return nil, err
86	}
87
88	spec.Converters = converterProvider
89	p := converterProvider.Get("markdown")
90	conv, err := p.New(converter.DocumentContext{})
91	if err != nil {
92		return nil, err
93	}
94	spec.MardownConverter = conv
95	if as, ok := conv.(converter.AnchorNameSanitizer); ok {
96		spec.anchorNameSanitizer = as
97	} else {
98		// Use Goldmark's sanitizer
99		p := converterProvider.Get("goldmark")
100		conv, err := p.New(converter.DocumentContext{})
101		if err != nil {
102			return nil, err
103		}
104		spec.anchorNameSanitizer = conv.(converter.AnchorNameSanitizer)
105	}
106
107	return spec, nil
108}
109
// stripHTMLReplacer is applied by StripHTML before the tags are removed: it
// turns newlines into spaces and the tags </p>, <br> and <br /> into newlines.
var stripHTMLReplacer = strings.NewReplacer("\n", " ", "</p>", "\n", "<br>", "\n", "<br />", "\n")
111
112// StripHTML accepts a string, strips out all HTML tags and returns it.
113func StripHTML(s string) string {
114	// Shortcut strings with no tags in them
115	if !strings.ContainsAny(s, "<>") {
116		return s
117	}
118	s = stripHTMLReplacer.Replace(s)
119
120	// Walk through the string removing all tags
121	b := bp.GetBuffer()
122	defer bp.PutBuffer(b)
123	var inTag, isSpace, wasSpace bool
124	for _, r := range s {
125		if !inTag {
126			isSpace = false
127		}
128
129		switch {
130		case r == '<':
131			inTag = true
132		case r == '>':
133			inTag = false
134		case unicode.IsSpace(r):
135			isSpace = true
136			fallthrough
137		default:
138			if !inTag && (!isSpace || (isSpace && !wasSpace)) {
139				b.WriteRune(r)
140			}
141		}
142
143		wasSpace = isSpace
144
145	}
146	return b.String()
147}
148
// stripEmptyNav returns a copy of in with every empty <nav> element (exactly
// "<nav>\n</nav>\n\n") removed.
func stripEmptyNav(in []byte) []byte {
	emptyNav := []byte("<nav>\n</nav>\n\n")
	return bytes.ReplaceAll(in, emptyNav, nil)
}
153
// BytesToHTML returns the content of b as a template.HTML value. The bytes
// are copied and not escaped in any way.
func BytesToHTML(b []byte) template.HTML {
	return template.HTML(b)
}
158
159// ExtractTOC extracts Table of Contents from content.
160func ExtractTOC(content []byte) (newcontent []byte, toc []byte) {
161	if !bytes.Contains(content, []byte("<nav>")) {
162		return content, nil
163	}
164	origContent := make([]byte, len(content))
165	copy(origContent, content)
166	first := []byte(`<nav>
167<ul>`)
168
169	last := []byte(`</ul>
170</nav>`)
171
172	replacement := []byte(`<nav id="TableOfContents">
173<ul>`)
174
175	startOfTOC := bytes.Index(content, first)
176
177	peekEnd := len(content)
178	if peekEnd > 70+startOfTOC {
179		peekEnd = 70 + startOfTOC
180	}
181
182	if startOfTOC < 0 {
183		return stripEmptyNav(content), toc
184	}
185	// Need to peek ahead to see if this nav element is actually the right one.
186	correctNav := bytes.Index(content[startOfTOC:peekEnd], []byte(`<li><a href="#`))
187	if correctNav < 0 { // no match found
188		return content, toc
189	}
190	lengthOfTOC := bytes.Index(content[startOfTOC:], last) + len(last)
191	endOfTOC := startOfTOC + lengthOfTOC
192
193	newcontent = append(content[:startOfTOC], content[endOfTOC:]...)
194	toc = append(replacement, origContent[startOfTOC+len(first):endOfTOC]...)
195	return
196}
197
198func (c *ContentSpec) RenderMarkdown(src []byte) ([]byte, error) {
199	b, err := c.MardownConverter.Convert(converter.RenderContext{Src: src})
200	if err != nil {
201		return nil, err
202	}
203	return b.Bytes(), nil
204}
205
206func (c *ContentSpec) SanitizeAnchorName(s string) string {
207	return c.anchorNameSanitizer.SanitizeAnchorName(s)
208}
209
210func (c *ContentSpec) ResolveMarkup(in string) string {
211	in = strings.ToLower(in)
212	switch in {
213	case "md", "markdown", "mdown":
214		return "markdown"
215	case "html", "htm":
216		return "html"
217	default:
218		if in == "mmark" {
219			Deprecated("Markup type mmark", "See https://gohugo.io//content-management/formats/#list-of-content-formats", true)
220		}
221		if conv := c.Converters.Get(in); conv != nil {
222			return conv.Name()
223		}
224	}
225	return ""
226}
227
// TotalWords returns the number of words in s, where a word is a maximal run
// of runes that are not white space as defined by unicode.IsSpace.
// This is a cheaper way of word counting than the obvious len(strings.Fields(s)).
func TotalWords(s string) int {
	words := 0
	prevSpace := true // the start of the string acts like white space
	for _, r := range s {
		space := unicode.IsSpace(r)
		// A word starts at every transition from white space to non-space.
		if prevSpace && !space {
			words++
		}
		prevSpace = space
	}
	return words
}
243
244// TruncateWordsByRune truncates words by runes.
245func (c *ContentSpec) TruncateWordsByRune(in []string) (string, bool) {
246	words := make([]string, len(in))
247	copy(words, in)
248
249	count := 0
250	for index, word := range words {
251		if count >= c.summaryLength {
252			return strings.Join(words[:index], " "), true
253		}
254		runeCount := utf8.RuneCountInString(word)
255		if len(word) == runeCount {
256			count++
257		} else if count+runeCount < c.summaryLength {
258			count += runeCount
259		} else {
260			for ri := range word {
261				if count >= c.summaryLength {
262					truncatedWords := append(words[:index], word[:ri])
263					return strings.Join(truncatedWords, " "), true
264				}
265				count++
266			}
267		}
268	}
269
270	return strings.Join(words, " "), false
271}
272
273// TruncateWordsToWholeSentence takes content and truncates to whole sentence
274// limited by max number of words. It also returns whether it is truncated.
275func (c *ContentSpec) TruncateWordsToWholeSentence(s string) (string, bool) {
276	var (
277		wordCount     = 0
278		lastWordIndex = -1
279	)
280
281	for i, r := range s {
282		if unicode.IsSpace(r) {
283			wordCount++
284			lastWordIndex = i
285
286			if wordCount >= c.summaryLength {
287				break
288			}
289
290		}
291	}
292
293	if lastWordIndex == -1 {
294		return s, false
295	}
296
297	endIndex := -1
298
299	for j, r := range s[lastWordIndex:] {
300		if isEndOfSentence(r) {
301			endIndex = j + lastWordIndex + utf8.RuneLen(r)
302			break
303		}
304	}
305
306	if endIndex == -1 {
307		return s, false
308	}
309
310	return strings.TrimSpace(s[:endIndex]), endIndex < len(s)
311}
312
313// TrimShortHTML removes the <p>/</p> tags from HTML input in the situation
314// where said tags are the only <p> tags in the input and enclose the content
315// of the input (whitespace excluded).
316func (c *ContentSpec) TrimShortHTML(input []byte) []byte {
317	firstOpeningP := bytes.Index(input, paragraphIndicator)
318	lastOpeningP := bytes.LastIndex(input, paragraphIndicator)
319
320	lastClosingP := bytes.LastIndex(input, closingPTag)
321	lastClosing := bytes.LastIndex(input, closingIndicator)
322
323	if firstOpeningP == lastOpeningP && lastClosingP == lastClosing {
324		input = bytes.TrimSpace(input)
325		input = bytes.TrimPrefix(input, openingPTag)
326		input = bytes.TrimSuffix(input, closingPTag)
327		input = bytes.TrimSpace(input)
328	}
329	return input
330}
331
// isEndOfSentence reports whether r ends a sentence: a period, question mark,
// exclamation mark, double quote or newline.
func isEndOfSentence(r rune) bool {
	switch r {
	case '.', '?', '!', '"', '\n':
		return true
	}
	return false
}
335
336// Kept only for benchmark.
337func (c *ContentSpec) truncateWordsToWholeSentenceOld(content string) (string, bool) {
338	words := strings.Fields(content)
339
340	if c.summaryLength >= len(words) {
341		return strings.Join(words, " "), false
342	}
343
344	for counter, word := range words[c.summaryLength:] {
345		if strings.HasSuffix(word, ".") ||
346			strings.HasSuffix(word, "?") ||
347			strings.HasSuffix(word, ".\"") ||
348			strings.HasSuffix(word, "!") {
349			upper := c.summaryLength + counter + 1
350			return strings.Join(words[:upper], " "), (upper < len(words))
351		}
352	}
353
354	return strings.Join(words[:c.summaryLength], " "), true
355}
356