extractor (refactoring and enhancements)
This commit is contained in:
parent
0abcf66fad
commit
1cfc3ca0a7
1
.gitignore
vendored
1
.gitignore
vendored
@ -1 +1,2 @@
|
|||||||
/.idea/
|
/.idea/
|
||||||
|
/trash/
|
||||||
|
|||||||
62
internal/extractors/pwextractor/extract_post.js
Normal file
62
internal/extractors/pwextractor/extract_post.js
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
// let fnc = // for autocomplete
//
// Converts the DOM subtree rooted at `el` into a simplified HTML string.
//
// Output format:
//   - runs of text are wrapped in <p>...</p>;
//   - only the inline tags listed in allowedMarkupTags are preserved, every
//     other element is unwrapped (its children are still visited);
//   - every <img> ends the current paragraph and is emitted on its own as
//     <img src="ABSOLUTE_URL"/>.
//
// Text and attribute values are HTML-escaped, because they are concatenated
// into markup and come from an arbitrary page.
//
// @param {Node} el root node of the post content
// @returns {string} simplified HTML; "" when the subtree has no text or images
el => {
    // Inline formatting tags that are copied to the output as-is.
    const allowedMarkupTags = ['b', 'i', 'strong'];

    let content = "";               // finished output
    let paragraph = "";             // paragraph currently being accumulated
    let paragraphHasText = false;   // true once non-whitespace text was added
    const openTags = [];            // allowed tags currently open in `paragraph`

    const escapeHtml = text => text
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;');

    // Flushes the accumulated paragraph into `content`.
    // Paragraphs without any real text are dropped instead of producing an
    // empty <p></p>. Tags that are still open (an image inside <b>, for
    // example) are closed here and re-opened in the next paragraph so the
    // output stays well-formed.
    const finishParagraph = () => {
        if (paragraphHasText) {
            const closing = openTags.slice().reverse().map(t => `</${t}>`).join('');
            content += "<p>" + paragraph + closing + "</p>";
        }
        paragraph = openTags.map(t => `<${t}>`).join('');
        paragraphHasText = false;
    };

    // Appends an <img> with an absolute src. Images without a src are skipped.
    const addImage = img => {
        const rawSrc = img.getAttribute('src');
        if (!rawSrc) {
            return;
        }
        let imgSrc = rawSrc;
        try {
            // Resolves root-relative, path-relative and protocol-relative
            // sources against the document base; absolute URLs are unchanged.
            imgSrc = new URL(rawSrc, document.baseURI).href;
        } catch (e) {
            // Unparseable URL: fall back to the raw attribute value.
        }
        content += `<img src="${escapeHtml(imgSrc)}"/>`;
    };

    const traverse = (node) => {
        // node = document.getRootNode(); // for autocomplete

        for (const child of node.childNodes) {
            switch (child.nodeType) {
                case child.ELEMENT_NODE: {
                    // child = document.getElementById(''); // for autocomplete

                    const tag = child.tagName.toLowerCase();

                    if (tag === 'img') {
                        finishParagraph();
                        addImage(child);
                        break;
                    }

                    const isMarkup = allowedMarkupTags.includes(tag);
                    if (isMarkup) {
                        paragraph += `<${tag}>`;
                        openTags.push(tag);
                    }

                    traverse(child);

                    if (isMarkup) {
                        openTags.pop();
                        paragraph += `</${tag}>`;
                    }

                    break;
                }
                case child.TEXT_NODE:
                    if (child.nodeValue.length > 0) {
                        // The trailing space keeps words from adjacent nodes apart.
                        paragraph += escapeHtml(child.nodeValue) + " ";
                        if (child.nodeValue.trim().length > 0) {
                            paragraphHasText = true;
                        }
                    }
                    break;
            }
        }
    };

    traverse(el);
    // Flush the text that follows the last image (or all of the text when the
    // post contains no image at all).
    finishParagraph();
    return content;
}
|
||||||
@ -1,14 +1,21 @@
|
|||||||
package pwextractor
|
package pwextractor
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
_ "embed"
|
||||||
"fmt"
|
"fmt"
|
||||||
"github.com/egor3f/rssalchemy/internal/models"
|
"github.com/egor3f/rssalchemy/internal/models"
|
||||||
"github.com/labstack/gommon/log"
|
"github.com/labstack/gommon/log"
|
||||||
"github.com/markusmobius/go-dateparser"
|
"github.com/markusmobius/go-dateparser"
|
||||||
"github.com/playwright-community/playwright-go"
|
"github.com/playwright-community/playwright-go"
|
||||||
"net/url"
|
)
|
||||||
"strings"
|
|
||||||
"time"
|
// Timeouts
|
||||||
|
var (
|
||||||
|
defTimeout = "100ms"
|
||||||
|
defOptInText = playwright.LocatorInnerTextOptions{Timeout: pwDuration(defTimeout)}
|
||||||
|
defOptTextCon = playwright.LocatorTextContentOptions{Timeout: pwDuration(defTimeout)}
|
||||||
|
defOptAttr = playwright.LocatorGetAttributeOptions{Timeout: pwDuration(defTimeout)}
|
||||||
|
defOptEval = playwright.LocatorEvaluateOptions{Timeout: pwDuration(defTimeout)}
|
||||||
)
|
)
|
||||||
|
|
||||||
type PwExtractor struct {
|
type PwExtractor struct {
|
||||||
@ -69,30 +76,71 @@ func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errR
|
|||||||
log.Warnf("waiting for page load: %v", err)
|
log.Warnf("waiting for page load: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
result = &models.TaskResult{}
|
parser := pageParser{
|
||||||
|
task: task,
|
||||||
|
page: page,
|
||||||
|
}
|
||||||
|
|
||||||
result.Title, err = page.Title()
|
result, err = parser.parse()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("parse page: %w", err)
|
||||||
|
}
|
||||||
|
if len(result.Items) == 0 {
|
||||||
|
return nil, fmt.Errorf("extract failed for all posts")
|
||||||
|
}
|
||||||
|
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
type pageParser struct {
|
||||||
|
task models.Task
|
||||||
|
page playwright.Page
|
||||||
|
|
||||||
|
// next fields only for debugging. Shit code, to do better later
|
||||||
|
postIdx int
|
||||||
|
fieldIdx int
|
||||||
|
}
|
||||||
|
|
||||||
|
// must accepts arbitrary string and error and returns just string, also logs everything.
|
||||||
|
// it is used for playwright functons that return both string and error to avoid boilerplate.
|
||||||
|
// fieldIdx is convinient variable used only for logging purposes, looks like shit, maybe i'll do it better later.
|
||||||
|
func (p *pageParser) must(s string, err error) string {
|
||||||
|
p.fieldIdx++
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("extract post field %d: %v", p.fieldIdx, err)
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
//log.Debugf("field=%d res=%.100s", p.fieldIdx, s)
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *pageParser) parse() (*models.TaskResult, error) {
|
||||||
|
var result models.TaskResult
|
||||||
|
var err error
|
||||||
|
|
||||||
|
result.Title, err = p.page.Title()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("page title: %w", err)
|
return nil, fmt.Errorf("page title: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
iconUrl, err := page.Locator("link[rel=apple-touch-icon]").First().
|
iconUrl, err := p.page.Locator("link[rel=apple-touch-icon]").First().
|
||||||
GetAttribute("href", playwright.LocatorGetAttributeOptions{Timeout: pwDuration("100ms")})
|
GetAttribute("href", playwright.LocatorGetAttributeOptions{Timeout: pwDuration("100ms")})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warnf("page icon url: %v", err)
|
log.Warnf("page icon url: %v", err)
|
||||||
} else {
|
} else {
|
||||||
result.Icon = absUrl(iconUrl, page)
|
result.Icon = absUrl(iconUrl, p.page)
|
||||||
}
|
}
|
||||||
|
|
||||||
posts, err := page.Locator(task.SelectorPost).All()
|
posts, err := p.page.Locator(p.task.SelectorPost).All()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("post locator: %w", err)
|
return nil, fmt.Errorf("post locator: %w", err)
|
||||||
}
|
}
|
||||||
if len(posts) == 0 {
|
if len(posts) == 0 {
|
||||||
return nil, fmt.Errorf("no posts on page")
|
return nil, fmt.Errorf("no posts on page")
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, post := range posts {
|
for _, post := range posts {
|
||||||
item, err := e.extractPost(task, post)
|
item, err := p.extractPost(post)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Errorf("extract post fields: %v", err)
|
log.Errorf("extract post fields: %v", err)
|
||||||
continue
|
continue
|
||||||
@ -103,48 +151,34 @@ func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errR
|
|||||||
}
|
}
|
||||||
result.Items = append(result.Items, item)
|
result.Items = append(result.Items, item)
|
||||||
}
|
}
|
||||||
if len(result.Items) == 0 {
|
|
||||||
return nil, fmt.Errorf("extract failed for all posts")
|
return &result, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
return result, nil
|
func (p *pageParser) extractPost(post playwright.Locator) (models.FeedItem, error) {
|
||||||
}
|
p.fieldIdx = 0
|
||||||
|
p.postIdx++
|
||||||
func (e *PwExtractor) extractPost(task models.Task, post playwright.Locator) (models.FeedItem, error) {
|
|
||||||
fieldIdx := 0
|
|
||||||
must := func(s string, err error) string {
|
|
||||||
fieldIdx++
|
|
||||||
if err != nil {
|
|
||||||
log.Errorf("extract post field %d: %v", fieldIdx, err)
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
log.Debugf("field=%d res=%.100s", fieldIdx, s)
|
|
||||||
return s
|
|
||||||
}
|
|
||||||
var item models.FeedItem
|
var item models.FeedItem
|
||||||
const defTimeout = "100ms"
|
|
||||||
defOpt := playwright.LocatorTextContentOptions{Timeout: pwDuration(defTimeout)}
|
|
||||||
defOptAttr := playwright.LocatorGetAttributeOptions{Timeout: pwDuration(defTimeout)}
|
|
||||||
log.Debugf("---- POST: ----")
|
|
||||||
|
|
||||||
item.Title = must(post.Locator(task.SelectorTitle).First().TextContent(defOpt))
|
item.Title = p.must(post.Locator(p.task.SelectorTitle).First().InnerText(defOptInText))
|
||||||
|
log.Debugf("---- POST: %s ----", item.Title)
|
||||||
|
|
||||||
item.Link = must(post.Locator(task.SelectorLink).First().GetAttribute("href", defOptAttr))
|
item.Link = p.must(post.Locator(p.task.SelectorLink).First().GetAttribute("href", defOptAttr))
|
||||||
page, _ := post.Page()
|
page, _ := post.Page()
|
||||||
item.Link = absUrl(item.Link, page)
|
item.Link = absUrl(item.Link, page)
|
||||||
|
|
||||||
item.Description = must(post.Locator(task.SelectorDescription).First().TextContent(defOpt))
|
item.Description = p.must(post.Locator(p.task.SelectorDescription).First().InnerText(defOptInText))
|
||||||
|
|
||||||
item.AuthorName = must(post.Locator(task.SelectorAuthor).First().TextContent(defOpt))
|
item.AuthorName = p.must(post.Locator(p.task.SelectorAuthor).First().InnerText(defOptInText))
|
||||||
|
|
||||||
item.AuthorLink = must(post.Locator(task.SelectorAuthor).First().GetAttribute("href", defOptAttr))
|
item.AuthorLink = p.must(post.Locator(p.task.SelectorAuthor).First().GetAttribute("href", defOptAttr))
|
||||||
item.AuthorLink = absUrl(item.AuthorLink, page)
|
item.AuthorLink = absUrl(item.AuthorLink, page)
|
||||||
|
|
||||||
item.Content = must(post.Locator(task.SelectorContent).First().TextContent(defOpt))
|
item.Content = p.extractContent(post)
|
||||||
|
|
||||||
item.Enclosure = must(post.Locator(task.SelectorEnclosure).First().GetAttribute("src", defOptAttr))
|
item.Enclosure = p.must(post.Locator(p.task.SelectorEnclosure).First().GetAttribute("src", defOptAttr))
|
||||||
|
|
||||||
createdDateStr := must(post.Locator(task.SelectorCreated).First().TextContent(defOpt))
|
createdDateStr := p.must(post.Locator(p.task.SelectorCreated).First().InnerText(defOptInText))
|
||||||
log.Debugf("date=%s", createdDateStr)
|
log.Debugf("date=%s", createdDateStr)
|
||||||
createdDate, err := dateparser.Parse(nil, createdDateStr)
|
createdDate, err := dateparser.Parse(nil, createdDateStr)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -153,25 +187,28 @@ func (e *PwExtractor) extractPost(task models.Task, post playwright.Locator) (mo
|
|||||||
item.Created = createdDate.Time
|
item.Created = createdDate.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
|
log.Debugf("---- END POST: %s ----", item.Title)
|
||||||
|
|
||||||
return item, nil
|
return item, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func absUrl(link string, page playwright.Page) string {
|
//go:embed extract_post.js
|
||||||
if strings.HasPrefix(link, "/") {
|
var extractPostScript string
|
||||||
pageUrl, _ := url.Parse(page.URL())
|
|
||||||
link = fmt.Sprintf("%s://%s%s", pageUrl.Scheme, pageUrl.Host, link)
|
|
||||||
}
|
|
||||||
log.Debugf("link=%s", link)
|
|
||||||
return link
|
|
||||||
}
|
|
||||||
|
|
||||||
// pwDuration converts string like "10s" to milliseconds float64 pointer
|
func (p *pageParser) extractContent(post playwright.Locator) string {
|
||||||
// needed for Playwright timeouts (wtf? why they don't use normal Durations?)
|
postContent := post.Locator(p.task.SelectorContent)
|
||||||
func pwDuration(s string) *float64 {
|
result, err := postContent.Evaluate(
|
||||||
dur, err := time.ParseDuration(s)
|
extractPostScript,
|
||||||
|
nil,
|
||||||
|
playwright.LocatorEvaluateOptions{Timeout: pwDuration("1s")},
|
||||||
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
panic(fmt.Errorf("failed to parse duration %s: %w", s, err))
|
log.Errorf("extract post content: evaluate: %v", err)
|
||||||
|
return p.must(postContent.TextContent(defOptTextCon))
|
||||||
}
|
}
|
||||||
f64 := float64(dur.Milliseconds())
|
resString, ok := result.(string)
|
||||||
return &f64
|
if !ok {
|
||||||
|
log.Errorf("extract post content: result type mismatch: %v", result)
|
||||||
|
}
|
||||||
|
return resString
|
||||||
}
|
}
|
||||||
|
|||||||
32
internal/extractors/pwextractor/utils.go
Normal file
32
internal/extractors/pwextractor/utils.go
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
package pwextractor
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"github.com/playwright-community/playwright-go"
|
||||||
|
"net/url"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
func absUrl(link string, page playwright.Page) string {
|
||||||
|
if len(link) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
if strings.HasPrefix(link, "/") {
|
||||||
|
pageUrl, _ := url.Parse(page.URL())
|
||||||
|
link = fmt.Sprintf("%s://%s%s", pageUrl.Scheme, pageUrl.Host, link)
|
||||||
|
}
|
||||||
|
//log.Debugf("link=%s", link)
|
||||||
|
return link
|
||||||
|
}
|
||||||
|
|
||||||
|
// pwDuration converts a Go duration string such as "10s" or "100ms" into a
// pointer to the equivalent number of milliseconds as float64, which is the
// form Playwright option structs take for timeouts.
//
// Fractional milliseconds are preserved ("1500us" yields 1.5), so that a
// sub-millisecond duration does not collapse to 0; Playwright documents a
// timeout of 0 as "no timeout".
//
// It panics if s is not a valid duration, so it is meant for constant,
// programmer-supplied values.
func pwDuration(s string) *float64 {
	dur, err := time.ParseDuration(s)
	if err != nil {
		panic(fmt.Errorf("failed to parse duration %s: %w", s, err))
	}
	ms := float64(dur) / float64(time.Millisecond)
	return &ms
}
|
||||||
Loading…
x
Reference in New Issue
Block a user