1package agents 2 3import ( 4 "bytes" 5 "fmt" 6 "strings" 7 8 "github.com/PuerkitoBio/goquery" 9 "github.com/michenriksen/aquatone/core" 10) 11 12type URLPageTitleExtractor struct { 13 session *core.Session 14} 15 16func NewURLPageTitleExtractor() *URLPageTitleExtractor { 17 return &URLPageTitleExtractor{} 18} 19 20func (a *URLPageTitleExtractor) ID() string { 21 return "agent:url_page_title_extractor" 22} 23 24func (a *URLPageTitleExtractor) Register(s *core.Session) error { 25 s.EventBus.SubscribeAsync(core.URLResponsive, a.OnURLResponsive, false) 26 a.session = s 27 28 return nil 29} 30 31func (a *URLPageTitleExtractor) OnURLResponsive(url string) { 32 a.session.Out.Debug("[%s] Received new responsive URL %s\n", a.ID(), url) 33 page := a.session.GetPage(url) 34 if page == nil { 35 a.session.Out.Error("Unable to find page for URL: %s\n", url) 36 return 37 } 38 39 a.session.WaitGroup.Add() 40 go func(page *core.Page) { 41 defer a.session.WaitGroup.Done() 42 body, err := a.session.ReadFile(fmt.Sprintf("html/%s.html", page.BaseFilename())) 43 if err != nil { 44 a.session.Out.Debug("[%s] Error reading HTML body file for %s: %s\n", a.ID(), page.URL, err) 45 return 46 } 47 48 doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body)) 49 if err != nil { 50 a.session.Out.Debug("[%s] Error when parsing HTML body file for %s: %s\n", a.ID(), page.URL, err) 51 return 52 } 53 54 page.PageTitle = strings.TrimSpace(doc.Find("Title").Text()) 55 }(page) 56} 57