2025-01-15 18:46:16 +03:00

178 lines
4.9 KiB
Go

package pwextractor
import (
"fmt"
"github.com/egor3f/rssalchemy/internal/models"
"github.com/labstack/gommon/log"
"github.com/markusmobius/go-dateparser"
"github.com/playwright-community/playwright-go"
"net/url"
"strings"
"time"
)
type PwExtractor struct {
pw *playwright.Playwright
chrome playwright.Browser
}
func New() (*PwExtractor, error) {
e := PwExtractor{}
var err error
e.pw, err = playwright.Run()
if err != nil {
return nil, fmt.Errorf("run playwright: %w", err)
}
e.chrome, err = e.pw.Chromium.Launch(playwright.BrowserTypeLaunchOptions{
ChromiumSandbox: playwright.Bool(true),
HandleSIGINT: playwright.Bool(false),
Timeout: pwDuration("5s"),
})
if err != nil {
return nil, fmt.Errorf("run chromium: %w", err)
}
return &e, nil
}
func (e *PwExtractor) Stop() error {
if err := e.chrome.Close(); err != nil {
return fmt.Errorf("closing chrome: %w", err)
}
if err := e.pw.Stop(); err != nil {
return fmt.Errorf("stopping playwright: %w", err)
}
return nil
}
func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errRet error) {
page, err := e.chrome.NewPage()
if err != nil {
return nil, fmt.Errorf("browser new page: %w", err)
}
defer func() {
err := page.Close()
if err != nil {
errRet = fmt.Errorf("close page: %w; other error=%w", err, errRet)
}
}()
log.Debugf("Page opened")
if _, err := page.Goto(task.URL); err != nil {
return nil, fmt.Errorf("goto page: %w", err)
}
log.Debugf("Url %s visited", task.URL)
if err := page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
State: playwright.LoadStateNetworkidle,
Timeout: pwDuration("5s"),
}); err != nil {
log.Warnf("waiting for page load: %v", err)
}
result = &models.TaskResult{}
result.Title, err = page.Title()
if err != nil {
return nil, fmt.Errorf("page title: %w", err)
}
iconUrl, err := page.Locator("link[rel=apple-touch-icon]").First().
GetAttribute("href", playwright.LocatorGetAttributeOptions{Timeout: pwDuration("100ms")})
if err != nil {
log.Warnf("page icon url: %v", err)
} else {
result.Icon = absUrl(iconUrl, page)
}
posts, err := page.Locator(task.SelectorPost).All()
if err != nil {
return nil, fmt.Errorf("post locator: %w", err)
}
if len(posts) == 0 {
return nil, fmt.Errorf("no posts on page")
}
for _, post := range posts {
item, err := e.extractPost(task, post)
if err != nil {
log.Errorf("extract post fields: %v", err)
continue
}
if len(item.Title) == 0 || len(item.Link) == 0 {
log.Warnf("post has no required fields, skip")
continue
}
result.Items = append(result.Items, item)
}
if len(result.Items) == 0 {
return nil, fmt.Errorf("extract failed for all posts")
}
return result, nil
}
func (e *PwExtractor) extractPost(task models.Task, post playwright.Locator) (models.FeedItem, error) {
fieldIdx := 0
must := func(s string, err error) string {
fieldIdx++
if err != nil {
log.Errorf("extract post field %d: %v", fieldIdx, err)
return ""
}
log.Debugf("field=%d res=%.100s", fieldIdx, s)
return s
}
var item models.FeedItem
const defTimeout = "100ms"
defOpt := playwright.LocatorTextContentOptions{Timeout: pwDuration(defTimeout)}
defOptAttr := playwright.LocatorGetAttributeOptions{Timeout: pwDuration(defTimeout)}
log.Debugf("---- POST: ----")
item.Title = must(post.Locator(task.SelectorTitle).First().TextContent(defOpt))
item.Link = must(post.Locator(task.SelectorLink).First().GetAttribute("href", defOptAttr))
page, _ := post.Page()
item.Link = absUrl(item.Link, page)
item.Description = must(post.Locator(task.SelectorDescription).First().TextContent(defOpt))
item.AuthorName = must(post.Locator(task.SelectorAuthor).First().TextContent(defOpt))
item.AuthorLink = must(post.Locator(task.SelectorAuthor).First().GetAttribute("href", defOptAttr))
item.AuthorLink = absUrl(item.AuthorLink, page)
item.Content = must(post.Locator(task.SelectorContent).First().TextContent(defOpt))
item.Enclosure = must(post.Locator(task.SelectorEnclosure).First().GetAttribute("src", defOptAttr))
createdDateStr := must(post.Locator(task.SelectorCreated).First().TextContent(defOpt))
log.Debugf("date=%s", createdDateStr)
createdDate, err := dateparser.Parse(nil, createdDateStr)
if err != nil {
log.Errorf("dateparser: %v", err)
} else {
item.Created = createdDate.Time
}
return item, nil
}
func absUrl(link string, page playwright.Page) string {
if strings.HasPrefix(link, "/") {
pageUrl, _ := url.Parse(page.URL())
link = fmt.Sprintf("%s://%s%s", pageUrl.Scheme, pageUrl.Host, link)
}
log.Debugf("link=%s", link)
return link
}
// pwDuration converts string like "10s" to milliseconds float64 pointer
// needed for Playwright timeouts (wtf? why they don't use normal Durations?)
func pwDuration(s string) *float64 {
dur, err := time.ParseDuration(s)
if err != nil {
panic(fmt.Errorf("failed to parse duration %s: %w", s, err))
}
f64 := float64(dur.Milliseconds())
return &f64
}