178 lines
4.9 KiB
Go
178 lines
4.9 KiB
Go
package pwextractor
|
|
|
|
import (
|
|
"fmt"
|
|
"github.com/egor3f/rssalchemy/internal/models"
|
|
"github.com/labstack/gommon/log"
|
|
"github.com/markusmobius/go-dateparser"
|
|
"github.com/playwright-community/playwright-go"
|
|
"net/url"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
type PwExtractor struct {
|
|
pw *playwright.Playwright
|
|
chrome playwright.Browser
|
|
}
|
|
|
|
func New() (*PwExtractor, error) {
|
|
e := PwExtractor{}
|
|
var err error
|
|
e.pw, err = playwright.Run()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("run playwright: %w", err)
|
|
}
|
|
e.chrome, err = e.pw.Chromium.Launch(playwright.BrowserTypeLaunchOptions{
|
|
ChromiumSandbox: playwright.Bool(true),
|
|
HandleSIGINT: playwright.Bool(false),
|
|
Timeout: pwDuration("5s"),
|
|
})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("run chromium: %w", err)
|
|
}
|
|
return &e, nil
|
|
}
|
|
|
|
func (e *PwExtractor) Stop() error {
|
|
if err := e.chrome.Close(); err != nil {
|
|
return fmt.Errorf("closing chrome: %w", err)
|
|
}
|
|
if err := e.pw.Stop(); err != nil {
|
|
return fmt.Errorf("stopping playwright: %w", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errRet error) {
|
|
page, err := e.chrome.NewPage()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("browser new page: %w", err)
|
|
}
|
|
defer func() {
|
|
err := page.Close()
|
|
if err != nil {
|
|
errRet = fmt.Errorf("close page: %w; other error=%w", err, errRet)
|
|
}
|
|
}()
|
|
log.Debugf("Page opened")
|
|
|
|
if _, err := page.Goto(task.URL); err != nil {
|
|
return nil, fmt.Errorf("goto page: %w", err)
|
|
}
|
|
log.Debugf("Url %s visited", task.URL)
|
|
|
|
if err := page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
|
|
State: playwright.LoadStateNetworkidle,
|
|
Timeout: pwDuration("5s"),
|
|
}); err != nil {
|
|
log.Warnf("waiting for page load: %v", err)
|
|
}
|
|
|
|
result = &models.TaskResult{}
|
|
|
|
result.Title, err = page.Title()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("page title: %w", err)
|
|
}
|
|
|
|
iconUrl, err := page.Locator("link[rel=apple-touch-icon]").First().
|
|
GetAttribute("href", playwright.LocatorGetAttributeOptions{Timeout: pwDuration("100ms")})
|
|
if err != nil {
|
|
log.Warnf("page icon url: %v", err)
|
|
} else {
|
|
result.Icon = absUrl(iconUrl, page)
|
|
}
|
|
|
|
posts, err := page.Locator(task.SelectorPost).All()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("post locator: %w", err)
|
|
}
|
|
if len(posts) == 0 {
|
|
return nil, fmt.Errorf("no posts on page")
|
|
}
|
|
for _, post := range posts {
|
|
item, err := e.extractPost(task, post)
|
|
if err != nil {
|
|
log.Errorf("extract post fields: %v", err)
|
|
continue
|
|
}
|
|
if len(item.Title) == 0 || len(item.Link) == 0 {
|
|
log.Warnf("post has no required fields, skip")
|
|
continue
|
|
}
|
|
result.Items = append(result.Items, item)
|
|
}
|
|
if len(result.Items) == 0 {
|
|
return nil, fmt.Errorf("extract failed for all posts")
|
|
}
|
|
|
|
return result, nil
|
|
}
|
|
|
|
func (e *PwExtractor) extractPost(task models.Task, post playwright.Locator) (models.FeedItem, error) {
|
|
fieldIdx := 0
|
|
must := func(s string, err error) string {
|
|
fieldIdx++
|
|
if err != nil {
|
|
log.Errorf("extract post field %d: %v", fieldIdx, err)
|
|
return ""
|
|
}
|
|
log.Debugf("field=%d res=%.100s", fieldIdx, s)
|
|
return s
|
|
}
|
|
var item models.FeedItem
|
|
const defTimeout = "100ms"
|
|
defOpt := playwright.LocatorTextContentOptions{Timeout: pwDuration(defTimeout)}
|
|
defOptAttr := playwright.LocatorGetAttributeOptions{Timeout: pwDuration(defTimeout)}
|
|
log.Debugf("---- POST: ----")
|
|
|
|
item.Title = must(post.Locator(task.SelectorTitle).First().TextContent(defOpt))
|
|
|
|
item.Link = must(post.Locator(task.SelectorLink).First().GetAttribute("href", defOptAttr))
|
|
page, _ := post.Page()
|
|
item.Link = absUrl(item.Link, page)
|
|
|
|
item.Description = must(post.Locator(task.SelectorDescription).First().TextContent(defOpt))
|
|
|
|
item.AuthorName = must(post.Locator(task.SelectorAuthor).First().TextContent(defOpt))
|
|
|
|
item.AuthorLink = must(post.Locator(task.SelectorAuthor).First().GetAttribute("href", defOptAttr))
|
|
item.AuthorLink = absUrl(item.AuthorLink, page)
|
|
|
|
item.Content = must(post.Locator(task.SelectorContent).First().TextContent(defOpt))
|
|
|
|
item.Enclosure = must(post.Locator(task.SelectorEnclosure).First().GetAttribute("src", defOptAttr))
|
|
|
|
createdDateStr := must(post.Locator(task.SelectorCreated).First().TextContent(defOpt))
|
|
log.Debugf("date=%s", createdDateStr)
|
|
createdDate, err := dateparser.Parse(nil, createdDateStr)
|
|
if err != nil {
|
|
log.Errorf("dateparser: %v", err)
|
|
} else {
|
|
item.Created = createdDate.Time
|
|
}
|
|
|
|
return item, nil
|
|
}
|
|
|
|
func absUrl(link string, page playwright.Page) string {
|
|
if strings.HasPrefix(link, "/") {
|
|
pageUrl, _ := url.Parse(page.URL())
|
|
link = fmt.Sprintf("%s://%s%s", pageUrl.Scheme, pageUrl.Host, link)
|
|
}
|
|
log.Debugf("link=%s", link)
|
|
return link
|
|
}
|
|
|
|
// pwDuration converts a duration string such as "10s" or "100ms" into a
// pointer to the equivalent number of milliseconds as a float64, which is the
// form the Playwright option structs in this file take for their Timeout
// fields.
//
// Fractional milliseconds are preserved, so a sub-millisecond duration does
// not collapse to 0.
// NOTE(review): Playwright appears to treat a timeout of 0 as "no timeout" —
// confirm against the Playwright documentation.
//
// It panics if s is not a valid time.ParseDuration string; it is meant to be
// called with constant strings only.
func pwDuration(s string) *float64 {
	dur, err := time.ParseDuration(s)
	if err != nil {
		panic(fmt.Errorf("failed to parse duration %s: %w", s, err))
	}
	ms := float64(dur) / float64(time.Millisecond)
	return &ms
}
|