214 lines
5.0 KiB
Go

package pwextractor
import (
"context"
_ "embed"
"fmt"
"github.com/egor3f/rssalchemy/internal/models"
"github.com/labstack/gommon/log"
"github.com/playwright-community/playwright-go"
)
// defTimeout is the default timeout, written in the string form that
// pwDuration accepts. The locator wrapper passes it to the playwright
// InnerText, GetAttribute and TextContent calls.
var defTimeout = "100ms"
// pageParser extracts feed data from a playwright page according to the
// selectors configured in a task.
type pageParser struct {
	// task supplies the selectors used to locate posts and their fields.
	task models.Task
	// page is the playwright page the data is read from.
	page playwright.Page
	// dateParser converts the text of a post's date element into a time value.
	dateParser DateParser

	// Debug-only counters: extractPost increments postIdx and resets fieldIdx
	// for every post. TODO: replace with a cleaner tracing mechanism.
	postIdx, fieldIdx int
}
// parse scrapes the page into a models.TaskResult.
//
// It waits for the page to finish loading, then reads the page title, the
// apple-touch-icon link and every element matching the task's post selector.
// The icon is optional: a failed lookup is logged as a warning and the Icon
// field stays empty. Posts whose extraction fails, or that lack a title, a
// link or a creation date, are logged and skipped.
//
// An error is returned when the title cannot be read, when the post locator
// fails, or when the post selector matches nothing.
func (p *pageParser) parse() (*models.TaskResult, error) {
	var result models.TaskResult
	var err error

	p.waitFullLoad()

	result.Title, err = p.page.Title()
	if err != nil {
		return nil, fmt.Errorf("page title: %w", err)
	}

	// Use the shared default timeout instead of a duplicated "100ms" literal so
	// the icon lookup stays in step with the other short attribute reads.
	iconURL, err := p.page.Locator("link[rel=apple-touch-icon]").First().
		GetAttribute("href", playwright.LocatorGetAttributeOptions{Timeout: pwDuration(defTimeout)})
	if err != nil {
		log.Warnf("page icon url: %v", err)
	} else {
		result.Icon = absUrl(iconURL, p.page)
	}

	posts, err := p.page.Locator(p.task.SelectorPost).All()
	if err != nil {
		return nil, fmt.Errorf("post locator: %w", err)
	}
	if len(posts) == 0 {
		return nil, fmt.Errorf("no posts on page")
	}
	log.Debugf("Posts count=%d", len(posts))

	for _, post := range posts {
		item, err := p.extractPost(post)
		if err != nil {
			log.Errorf("extract post fields: %v", err)
			continue
		}
		// Title, link and creation date are mandatory; anything else may be empty.
		if len(item.Title) == 0 || len(item.Link) == 0 || item.Created.IsZero() {
			log.Warnf("post has no required fields, skip")
			continue
		}
		result.Items = append(result.Items, item)
	}
	return &result, nil
}
// waitFullLoad blocks until playwright's WaitForLoadState call for the
// network-idle state returns. The call is given a 5s timeout and runs in a
// goroutine that cancels a context when it finishes; the caller waits on that
// context. The outcome (including any error) is only logged at debug level,
// so a page that never becomes idle does not abort parsing.
func (p *pageParser) waitFullLoad() {
	opts := playwright.PageWaitForLoadStateOptions{
		State:   playwright.LoadStateNetworkidle,
		Timeout: pwDuration("5s"),
	}

	done, finish := context.WithCancel(context.Background())
	go func() {
		defer finish()
		waitErr := p.page.WaitForLoadState(opts)
		log.Debugf("WaitFor LoadState finished with %v", waitErr)
	}()

	<-done.Done()
}
// extractPost reads the configured fields of one post element into a
// models.FeedItem.
//
// Title, link and creation date are always queried. Description, author,
// content and enclosure are queried only when the task configures a selector
// for them, so an unset optional selector costs no browser round-trip and
// produces no error log. Failures on individual fields are logged by the
// locator helpers and leave the field empty; the caller decides whether the
// resulting item is usable.
//
// An error is returned only when the page owning the post cannot be
// resolved, because that page is what absUrl is given for the post's links.
func (p *pageParser) extractPost(post playwright.Locator) (models.FeedItem, error) {
	p.fieldIdx = 0
	p.postIdx++

	// Resolve the owning page first: previously the error was discarded and a
	// nil page could be handed to absUrl.
	page, err := post.Page()
	if err != nil {
		return models.FeedItem{}, fmt.Errorf("post page: %w", err)
	}

	var item models.FeedItem

	item.Title = newLocator(post, p.task.SelectorTitle).First().InnerText()
	log.Debugf("---- POST: %s ----", item.Title)

	link := newLocator(post, p.task.SelectorLink).First().GetAttribute("href")
	item.Link = absUrl(link, page)

	if len(p.task.SelectorDescription) > 0 {
		item.Description = newLocator(post, p.task.SelectorDescription).First().InnerText()
	}

	if len(p.task.SelectorAuthor) > 0 {
		// One locator serves both the author's name and the author's link.
		author := newLocator(post, p.task.SelectorAuthor).First()
		item.AuthorName = author.InnerText()
		item.AuthorLink = absUrl(author.GetAttribute("href"), page)
	}

	if len(p.task.SelectorContent) > 0 {
		item.Content = p.extractContent(post)
	}

	if len(p.task.SelectorEnclosure) > 0 {
		// NOTE(review): unlike the links above, src is not passed through
		// absUrl — confirm whether relative enclosure URLs are expected.
		item.Enclosure = newLocator(post, p.task.SelectorEnclosure).First().GetAttribute("src")
	}

	createdDateStr := newLocator(post, p.task.SelectorCreated).First().InnerText()
	log.Debugf("date=%s", createdDateStr)
	createdDate, err := p.dateParser.ParseDate(createdDateStr)
	if err != nil {
		// Leave Created at its zero value; parse skips items without a date.
		log.Errorf("dateparser: %v", err)
	} else {
		item.Created = createdDate
	}

	return item, nil
}
// extractPostScript holds the source of extract_post.js, embedded at build
// time. extractContent evaluates it against a post's content locator.
//
//go:embed extract_post.js
var extractPostScript string
// extractContent returns the content of a post by evaluating the embedded
// extract_post.js script on the element matched by the task's content
// selector, with a 1s timeout.
//
// If the evaluation fails, the error is logged and the wrapper's TextContent
// is returned instead. If the script yields something other than a string,
// that is logged and an empty string is returned.
func (p *pageParser) extractContent(post playwright.Locator) string {
	content := newLocator(post, p.task.SelectorContent)
	opts := playwright.LocatorEvaluateOptions{Timeout: pwDuration("1s")}

	raw, err := content.Evaluate(extractPostScript, nil, opts)
	if err != nil {
		log.Errorf("extract post content: evaluate: %v", err)
		return content.TextContent()
	}

	if text, isString := raw.(string); isString {
		return text
	}
	log.Errorf("extract post content: result type mismatch: %v", raw)
	return ""
}
// locator wraps a playwright.Locator together with the selector string it was
// created from, so that log messages can name the selector (see String).
// The wrapper's InnerText, GetAttribute and TextContent methods shadow the
// embedded ones with variants that log failures and return "" instead of
// returning an error.
type locator struct {
selector string
playwright.Locator
}
// newLocator builds a locator for selector scoped to parent, and records the
// selector text alongside it for use in log output.
func newLocator(parent playwright.Locator, selector string) *locator {
	wrapped := new(locator)
	wrapped.selector = selector
	wrapped.Locator = parent.Locator(selector)
	return wrapped
}
// String returns the selector this locator was built from, which lets a
// *locator be passed directly to %s in log messages.
func (l *locator) String() string {
return l.selector
}
// checkVisible reports whether the located element is currently visible.
// A failed visibility query is logged as an error and an invisible element
// as a warning; both cases yield false.
func (l *locator) checkVisible() bool {
	shown, err := l.IsVisible()
	switch {
	case err != nil:
		log.Errorf("locator %s isVisible: %v", l, err)
		return false
	case !shown:
		log.Warnf("locator %s is not visible", l)
		return false
	default:
		return true
	}
}
// First returns a wrapper around the first match of the underlying locator,
// carrying over the same selector text.
func (l *locator) First() *locator {
	head := l.Locator.First()
	return &locator{selector: l.selector, Locator: head}
}
// InnerText returns the element's inner text, read with the default timeout.
// It returns "" when the element is not visible or when the read fails; the
// failure is logged rather than returned.
func (l *locator) InnerText() string {
	if l.checkVisible() {
		opts := playwright.LocatorInnerTextOptions{Timeout: pwDuration(defTimeout)}
		text, err := l.Locator.InnerText(opts)
		if err == nil {
			return text
		}
		log.Errorf("locator %s innerText: %v", l, err)
	}
	return ""
}
// GetAttribute returns the value of the named attribute, read with the
// default timeout. It returns "" when the element is not visible or when the
// read fails; the failure is logged rather than returned.
func (l *locator) GetAttribute(name string) string {
	var value string
	if l.checkVisible() {
		opts := playwright.LocatorGetAttributeOptions{Timeout: pwDuration(defTimeout)}
		got, err := l.Locator.GetAttribute(name, opts)
		if err != nil {
			log.Errorf("locator %s getAttribute %s: %v", l, name, err)
		} else {
			value = got
		}
	}
	return value
}
// TextContent returns the element's text content, read with the default
// timeout. It returns "" when the element is not visible or when the read
// fails; the failure is logged rather than returned.
func (l *locator) TextContent() string {
	if visible := l.checkVisible(); !visible {
		return ""
	}

	text, err := l.Locator.TextContent(
		playwright.LocatorTextContentOptions{Timeout: pwDuration(defTimeout)},
	)
	if err == nil {
		return text
	}

	log.Errorf("locator %s textContent: %v", l, err)
	return ""
}