1package agents
2
3import (
4	"bytes"
5	"fmt"
6	"strings"
7
8	"github.com/PuerkitoBio/goquery"
9	"github.com/michenriksen/aquatone/core"
10)
11
// URLPageTitleExtractor is an agent that reacts to responsive URLs by
// extracting the title from the page's stored HTML body and recording it on
// the page.
type URLPageTitleExtractor struct {
	// session is the session this agent operates on; it is set by Register.
	session *core.Session
}
15
16func NewURLPageTitleExtractor() *URLPageTitleExtractor {
17	return &URLPageTitleExtractor{}
18}
19
20func (a *URLPageTitleExtractor) ID() string {
21	return "agent:url_page_title_extractor"
22}
23
24func (a *URLPageTitleExtractor) Register(s *core.Session) error {
25	s.EventBus.SubscribeAsync(core.URLResponsive, a.OnURLResponsive, false)
26	a.session = s
27
28	return nil
29}
30
31func (a *URLPageTitleExtractor) OnURLResponsive(url string) {
32	a.session.Out.Debug("[%s] Received new responsive URL %s\n", a.ID(), url)
33	page := a.session.GetPage(url)
34	if page == nil {
35		a.session.Out.Error("Unable to find page for URL: %s\n", url)
36		return
37	}
38
39	a.session.WaitGroup.Add()
40	go func(page *core.Page) {
41		defer a.session.WaitGroup.Done()
42		body, err := a.session.ReadFile(fmt.Sprintf("html/%s.html", page.BaseFilename()))
43		if err != nil {
44			a.session.Out.Debug("[%s] Error reading HTML body file for %s: %s\n", a.ID(), page.URL, err)
45			return
46		}
47
48		doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
49		if err != nil {
50			a.session.Out.Debug("[%s] Error when parsing HTML body file for %s: %s\n", a.ID(), page.URL, err)
51			return
52		}
53
54		page.PageTitle = strings.TrimSpace(doc.Find("Title").Text())
55	}(page)
56}
57