extractor rewrite (use custom wrappers over playwright locator)
This commit is contained in:
parent
aab8c026f4
commit
72ec41dccf
@ -75,7 +75,9 @@ func main() {
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
start := time.Now()
|
||||||
result, err := pwe.Extract(task)
|
result, err := pwe.Extract(task)
|
||||||
|
log.Infof("Extract took %v ms", time.Since(start).Milliseconds())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Errorf("extract: %v", err)
|
log.Errorf("extract: %v", err)
|
||||||
scrResult, err := pwe.Screenshot(task)
|
scrResult, err := pwe.Screenshot(task)
|
||||||
|
|||||||
223
internal/extractors/pwextractor/pageparser.go
Normal file
223
internal/extractors/pwextractor/pageparser.go
Normal file
@ -0,0 +1,223 @@
|
|||||||
|
package pwextractor
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
_ "embed"
|
||||||
|
"fmt"
|
||||||
|
"github.com/egor3f/rssalchemy/internal/models"
|
||||||
|
"github.com/labstack/gommon/log"
|
||||||
|
"github.com/playwright-community/playwright-go"
|
||||||
|
)
|
||||||
|
|
||||||
|
// defTimeout is the timeout, in the string form taken by pwDuration, that the
// locator wrapper passes to its inner-text, text-content and attribute reads.
var defTimeout = "100ms"
|
||||||
|
|
||||||
|
type pageParser struct {
|
||||||
|
task models.Task
|
||||||
|
page playwright.Page
|
||||||
|
dateParser DateParser
|
||||||
|
|
||||||
|
// next fields only for debugging. Shit code, to do better later
|
||||||
|
postIdx int
|
||||||
|
fieldIdx int
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *pageParser) parse() (*models.TaskResult, error) {
|
||||||
|
var result models.TaskResult
|
||||||
|
var err error
|
||||||
|
|
||||||
|
p.waitFullLoad()
|
||||||
|
|
||||||
|
result.Title, err = p.page.Title()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("page title: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
iconUrl, err := p.page.Locator("link[rel=apple-touch-icon]").First().
|
||||||
|
GetAttribute("href", playwright.LocatorGetAttributeOptions{Timeout: pwDuration("100ms")})
|
||||||
|
if err != nil {
|
||||||
|
log.Warnf("page icon url: %v", err)
|
||||||
|
} else {
|
||||||
|
result.Icon = absUrl(iconUrl, p.page)
|
||||||
|
}
|
||||||
|
|
||||||
|
posts, err := p.page.Locator(p.task.SelectorPost).All()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("post locator: %w", err)
|
||||||
|
}
|
||||||
|
if len(posts) == 0 {
|
||||||
|
return nil, fmt.Errorf("no posts on page")
|
||||||
|
}
|
||||||
|
log.Debugf("Posts count=%d", len(posts))
|
||||||
|
|
||||||
|
for _, post := range posts {
|
||||||
|
item, err := p.extractPost(post)
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("extract post fields: %v", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if len(item.Title) == 0 || len(item.Link) == 0 || item.Created.IsZero() {
|
||||||
|
log.Warnf("post has no required fields, skip")
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
result.Items = append(result.Items, item)
|
||||||
|
}
|
||||||
|
|
||||||
|
return &result, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *pageParser) waitFullLoad() {
|
||||||
|
timeout := pwDuration("5s")
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
err := p.page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
|
||||||
|
State: playwright.LoadStateNetworkidle,
|
||||||
|
Timeout: timeout,
|
||||||
|
})
|
||||||
|
log.Debugf("WaitFor LoadState finished with %v", err)
|
||||||
|
cancel()
|
||||||
|
}()
|
||||||
|
go func() {
|
||||||
|
err := p.page.Locator(p.task.SelectorPost).Locator(p.task.SelectorTitle).Last().WaitFor(
|
||||||
|
playwright.LocatorWaitForOptions{
|
||||||
|
State: playwright.WaitForSelectorStateVisible,
|
||||||
|
Timeout: timeout,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
log.Debugf("WaitFor LOCATOR finished with %v", err)
|
||||||
|
cancel()
|
||||||
|
}()
|
||||||
|
|
||||||
|
<-ctx.Done()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *pageParser) extractPost(post playwright.Locator) (models.FeedItem, error) {
|
||||||
|
p.fieldIdx = 0
|
||||||
|
p.postIdx++
|
||||||
|
var item models.FeedItem
|
||||||
|
|
||||||
|
item.Title = newLocator(post, p.task.SelectorTitle).First().InnerText()
|
||||||
|
log.Debugf("---- POST: %s ----", item.Title)
|
||||||
|
|
||||||
|
item.Link = newLocator(post, p.task.SelectorLink).First().GetAttribute("href")
|
||||||
|
page, _ := post.Page()
|
||||||
|
item.Link = absUrl(item.Link, page)
|
||||||
|
|
||||||
|
if len(p.task.SelectorDescription) > 0 {
|
||||||
|
item.Description = newLocator(post, p.task.SelectorDescription).First().InnerText()
|
||||||
|
}
|
||||||
|
|
||||||
|
item.AuthorName = newLocator(post, p.task.SelectorAuthor).First().InnerText()
|
||||||
|
|
||||||
|
item.AuthorLink = newLocator(post, p.task.SelectorAuthor).First().GetAttribute("href")
|
||||||
|
item.AuthorLink = absUrl(item.AuthorLink, page)
|
||||||
|
|
||||||
|
if len(p.task.SelectorContent) > 0 {
|
||||||
|
item.Content = p.extractContent(post)
|
||||||
|
}
|
||||||
|
|
||||||
|
item.Enclosure = newLocator(post, p.task.SelectorEnclosure).First().GetAttribute("src")
|
||||||
|
|
||||||
|
createdDateStr := newLocator(post, p.task.SelectorCreated).First().InnerText()
|
||||||
|
log.Debugf("date=%s", createdDateStr)
|
||||||
|
createdDate, err := p.dateParser.ParseDate(createdDateStr)
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("dateparser: %v", err)
|
||||||
|
} else {
|
||||||
|
item.Created = createdDate
|
||||||
|
}
|
||||||
|
|
||||||
|
return item, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
//go:embed extract_post.js
|
||||||
|
var extractPostScript string
|
||||||
|
|
||||||
|
func (p *pageParser) extractContent(post playwright.Locator) string {
|
||||||
|
postContent := newLocator(post, p.task.SelectorContent)
|
||||||
|
result, err := postContent.Evaluate(
|
||||||
|
extractPostScript,
|
||||||
|
nil,
|
||||||
|
playwright.LocatorEvaluateOptions{Timeout: pwDuration("1s")},
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("extract post content: evaluate: %v", err)
|
||||||
|
return postContent.TextContent()
|
||||||
|
}
|
||||||
|
resString, ok := result.(string)
|
||||||
|
if !ok {
|
||||||
|
log.Errorf("extract post content: result type mismatch: %v", result)
|
||||||
|
}
|
||||||
|
return resString
|
||||||
|
}
|
||||||
|
|
||||||
|
type locator struct {
|
||||||
|
selector string
|
||||||
|
playwright.Locator
|
||||||
|
}
|
||||||
|
|
||||||
|
func newLocator(parent playwright.Locator, selector string) *locator {
|
||||||
|
return &locator{
|
||||||
|
selector: selector,
|
||||||
|
Locator: parent.Locator(selector),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (l *locator) String() string {
|
||||||
|
return l.selector
|
||||||
|
}
|
||||||
|
|
||||||
|
func (l *locator) checkVisible() bool {
|
||||||
|
visible, err := l.IsVisible()
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("locator %s isVisible: %v", l, err)
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if !visible {
|
||||||
|
log.Warnf("locator %s is not visible", l)
|
||||||
|
}
|
||||||
|
return visible
|
||||||
|
}
|
||||||
|
|
||||||
|
func (l *locator) First() *locator {
|
||||||
|
return &locator{l.selector, l.Locator.First()}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (l *locator) InnerText() string {
|
||||||
|
if !l.checkVisible() {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
t, err := l.Locator.InnerText(playwright.LocatorInnerTextOptions{Timeout: pwDuration(defTimeout)})
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("locator %s innerText: %v", l, err)
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return t
|
||||||
|
}
|
||||||
|
|
||||||
|
func (l *locator) GetAttribute(name string) string {
|
||||||
|
if !l.checkVisible() {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
t, err := l.Locator.GetAttribute(name, playwright.LocatorGetAttributeOptions{Timeout: pwDuration(defTimeout)})
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("locator %s getAttribute %s: %v", l, name, err)
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return t
|
||||||
|
}
|
||||||
|
|
||||||
|
func (l *locator) TextContent() string {
|
||||||
|
if !l.checkVisible() {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
t, err := l.Locator.TextContent(playwright.LocatorTextContentOptions{Timeout: pwDuration(defTimeout)})
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("locator %s textContent: %v", l, err)
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return t
|
||||||
|
}
|
||||||
@ -1,8 +1,6 @@
|
|||||||
package pwextractor
|
package pwextractor
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
|
||||||
_ "embed"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"github.com/egor3f/rssalchemy/internal/models"
|
"github.com/egor3f/rssalchemy/internal/models"
|
||||||
"github.com/labstack/gommon/log"
|
"github.com/labstack/gommon/log"
|
||||||
@ -12,14 +10,6 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Timeouts
|
|
||||||
var (
|
|
||||||
defTimeout = "50ms"
|
|
||||||
defOptInText = playwright.LocatorInnerTextOptions{Timeout: pwDuration(defTimeout)}
|
|
||||||
defOptTextCon = playwright.LocatorTextContentOptions{Timeout: pwDuration(defTimeout)}
|
|
||||||
defOptAttr = playwright.LocatorGetAttributeOptions{Timeout: pwDuration(defTimeout)}
|
|
||||||
)
|
|
||||||
|
|
||||||
var userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36"
|
var userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36"
|
||||||
var secChUa = `"Chromium";v="132", "Google Chrome";v="132", "Not-A.Brand";v="99"`
|
var secChUa = `"Chromium";v="132", "Google Chrome";v="132", "Not-A.Brand";v="99"`
|
||||||
|
|
||||||
@ -229,156 +219,3 @@ func (e *PwExtractor) Screenshot(task models.Task) (result *models.ScreenshotTas
|
|||||||
})
|
})
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
type pageParser struct {
|
|
||||||
task models.Task
|
|
||||||
page playwright.Page
|
|
||||||
dateParser DateParser
|
|
||||||
|
|
||||||
// next fields only for debugging. Shit code, to do better later
|
|
||||||
postIdx int
|
|
||||||
fieldIdx int
|
|
||||||
}
|
|
||||||
|
|
||||||
// must accepts arbitrary string and error and returns just string, also logs everything.
|
|
||||||
// it is used for playwright functons that return both string and error to avoid boilerplate.
|
|
||||||
// fieldIdx is convinient variable used only for logging purposes, looks like shit, maybe i'll do it better later.
|
|
||||||
func (p *pageParser) must(s string, err error) string {
|
|
||||||
p.fieldIdx++
|
|
||||||
if err != nil {
|
|
||||||
log.Errorf("extract post field %d: %v", p.fieldIdx, err)
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
//log.Debugf("field=%d res=%.100s", p.fieldIdx, s)
|
|
||||||
return s
|
|
||||||
}
|
|
||||||
|
|
||||||
func (p *pageParser) parse() (*models.TaskResult, error) {
|
|
||||||
var result models.TaskResult
|
|
||||||
var err error
|
|
||||||
|
|
||||||
p.waitFullLoad()
|
|
||||||
|
|
||||||
result.Title, err = p.page.Title()
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("page title: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
iconUrl, err := p.page.Locator("link[rel=apple-touch-icon]").First().
|
|
||||||
GetAttribute("href", playwright.LocatorGetAttributeOptions{Timeout: pwDuration("100ms")})
|
|
||||||
if err != nil {
|
|
||||||
log.Warnf("page icon url: %v", err)
|
|
||||||
} else {
|
|
||||||
result.Icon = absUrl(iconUrl, p.page)
|
|
||||||
}
|
|
||||||
|
|
||||||
posts, err := p.page.Locator(p.task.SelectorPost).All()
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("post locator: %w", err)
|
|
||||||
}
|
|
||||||
if len(posts) == 0 {
|
|
||||||
return nil, fmt.Errorf("no posts on page")
|
|
||||||
}
|
|
||||||
log.Debugf("Posts count=%d", len(posts))
|
|
||||||
|
|
||||||
for _, post := range posts {
|
|
||||||
item, err := p.extractPost(post)
|
|
||||||
if err != nil {
|
|
||||||
log.Errorf("extract post fields: %v", err)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if len(item.Title) == 0 || len(item.Link) == 0 || item.Created.IsZero() {
|
|
||||||
log.Warnf("post has no required fields, skip")
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
result.Items = append(result.Items, item)
|
|
||||||
}
|
|
||||||
|
|
||||||
return &result, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (p *pageParser) waitFullLoad() {
|
|
||||||
timeout := pwDuration("5s")
|
|
||||||
ctx, cancel := context.WithCancel(context.Background())
|
|
||||||
|
|
||||||
go func() {
|
|
||||||
err := p.page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
|
|
||||||
State: playwright.LoadStateNetworkidle,
|
|
||||||
Timeout: timeout,
|
|
||||||
})
|
|
||||||
log.Debugf("WaitFor LoadState finished with %v", err)
|
|
||||||
cancel()
|
|
||||||
}()
|
|
||||||
go func() {
|
|
||||||
err := p.page.Locator(p.task.SelectorPost).Locator(p.task.SelectorTitle).Last().WaitFor(
|
|
||||||
playwright.LocatorWaitForOptions{
|
|
||||||
State: playwright.WaitForSelectorStateVisible,
|
|
||||||
Timeout: timeout,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
log.Debugf("WaitFor LOCATOR finished with %v", err)
|
|
||||||
cancel()
|
|
||||||
}()
|
|
||||||
|
|
||||||
<-ctx.Done()
|
|
||||||
}
|
|
||||||
|
|
||||||
func (p *pageParser) extractPost(post playwright.Locator) (models.FeedItem, error) {
|
|
||||||
p.fieldIdx = 0
|
|
||||||
p.postIdx++
|
|
||||||
var item models.FeedItem
|
|
||||||
|
|
||||||
item.Title = p.must(post.Locator(p.task.SelectorTitle).First().InnerText(defOptInText))
|
|
||||||
log.Debugf("---- POST: %s ----", item.Title)
|
|
||||||
|
|
||||||
item.Link = p.must(post.Locator(p.task.SelectorLink).First().GetAttribute("href", defOptAttr))
|
|
||||||
page, _ := post.Page()
|
|
||||||
item.Link = absUrl(item.Link, page)
|
|
||||||
|
|
||||||
if len(p.task.SelectorDescription) > 0 {
|
|
||||||
item.Description = p.must(post.Locator(p.task.SelectorDescription).First().InnerText(defOptInText))
|
|
||||||
}
|
|
||||||
|
|
||||||
item.AuthorName = p.must(post.Locator(p.task.SelectorAuthor).First().InnerText(defOptInText))
|
|
||||||
|
|
||||||
item.AuthorLink = p.must(post.Locator(p.task.SelectorAuthor).First().GetAttribute("href", defOptAttr))
|
|
||||||
item.AuthorLink = absUrl(item.AuthorLink, page)
|
|
||||||
|
|
||||||
if len(p.task.SelectorContent) > 0 {
|
|
||||||
item.Content = p.extractContent(post)
|
|
||||||
}
|
|
||||||
|
|
||||||
item.Enclosure = p.must(post.Locator(p.task.SelectorEnclosure).First().GetAttribute("src", defOptAttr))
|
|
||||||
|
|
||||||
createdDateStr := p.must(post.Locator(p.task.SelectorCreated).First().InnerText(defOptInText))
|
|
||||||
log.Debugf("date=%s", createdDateStr)
|
|
||||||
createdDate, err := p.dateParser.ParseDate(createdDateStr)
|
|
||||||
if err != nil {
|
|
||||||
log.Errorf("dateparser: %v", err)
|
|
||||||
} else {
|
|
||||||
item.Created = createdDate
|
|
||||||
}
|
|
||||||
|
|
||||||
return item, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
//go:embed extract_post.js
|
|
||||||
var extractPostScript string
|
|
||||||
|
|
||||||
func (p *pageParser) extractContent(post playwright.Locator) string {
|
|
||||||
postContent := post.Locator(p.task.SelectorContent)
|
|
||||||
result, err := postContent.Evaluate(
|
|
||||||
extractPostScript,
|
|
||||||
nil,
|
|
||||||
playwright.LocatorEvaluateOptions{Timeout: pwDuration("1s")},
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
log.Errorf("extract post content: evaluate: %v", err)
|
|
||||||
return p.must(postContent.TextContent(defOptTextCon))
|
|
||||||
}
|
|
||||||
resString, ok := result.(string)
|
|
||||||
if !ok {
|
|
||||||
log.Errorf("extract post content: result type mismatch: %v", result)
|
|
||||||
}
|
|
||||||
return resString
|
|
||||||
}
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user