From 1cfc3ca0a74ffc463a239b8c86fb6100959a8ba3 Mon Sep 17 00:00:00 2001 From: Egor Aristov Date: Fri, 17 Jan 2025 13:21:19 +0300 Subject: [PATCH] extractor (refactoring and enhancements) --- .gitignore | 1 + .../extractors/pwextractor/extract_post.js | 62 ++++++++ .../extractors/pwextractor/pwextractor.go | 139 +++++++++++------- internal/extractors/pwextractor/utils.go | 32 ++++ 4 files changed, 183 insertions(+), 51 deletions(-) create mode 100644 internal/extractors/pwextractor/extract_post.js create mode 100644 internal/extractors/pwextractor/utils.go diff --git a/.gitignore b/.gitignore index 85e7c1d..5d88c84 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /.idea/ +/trash/ diff --git a/internal/extractors/pwextractor/extract_post.js b/internal/extractors/pwextractor/extract_post.js new file mode 100644 index 0000000..c946634 --- /dev/null +++ b/internal/extractors/pwextractor/extract_post.js @@ -0,0 +1,62 @@ +// let fnc = // for autocomplete +el => { + let content = ""; + let paragraph = ""; + + const finishParagraph = () => { + content += "

" + paragraph + "

"; + paragraph = ""; + } + + const addImage = img => { + let imgSrc = img.getAttribute('src'); + if (imgSrc.startsWith('/')) { + imgSrc = `${document.location.origin}/${imgSrc}`; + } + content += ``; + }; + + let traverse = (node) => { + // node = document.getRootNode(); // for autocomplete + + if (node.childNodes.length === 0) { + return + } + + for (let child of node.childNodes) { + switch (child.nodeType) { + case child.ELEMENT_NODE: + // child = document.getElementById(''); // for autocomplete + + let tag = child.tagName.toLowerCase(); + + const allowedMarkupTags = ['b', 'i', 'strong']; + if (allowedMarkupTags.includes(tag)) { + paragraph += `<${tag}>` + } + + if (tag === 'img') { + finishParagraph(); + addImage(child); + break; + } + + traverse(child); + + if (allowedMarkupTags.includes(tag)) { + paragraph += `` + } + + break; + case child.TEXT_NODE: + if (child.nodeValue.length > 0) { + paragraph += child.nodeValue + " "; + } + break; + } + } + }; + + traverse(el); + return content; +} diff --git a/internal/extractors/pwextractor/pwextractor.go b/internal/extractors/pwextractor/pwextractor.go index 86d62b3..f925c61 100644 --- a/internal/extractors/pwextractor/pwextractor.go +++ b/internal/extractors/pwextractor/pwextractor.go @@ -1,14 +1,21 @@ package pwextractor import ( + _ "embed" "fmt" "github.com/egor3f/rssalchemy/internal/models" "github.com/labstack/gommon/log" "github.com/markusmobius/go-dateparser" "github.com/playwright-community/playwright-go" - "net/url" - "strings" - "time" +) + +// Timeouts +var ( + defTimeout = "100ms" + defOptInText = playwright.LocatorInnerTextOptions{Timeout: pwDuration(defTimeout)} + defOptTextCon = playwright.LocatorTextContentOptions{Timeout: pwDuration(defTimeout)} + defOptAttr = playwright.LocatorGetAttributeOptions{Timeout: pwDuration(defTimeout)} + defOptEval = playwright.LocatorEvaluateOptions{Timeout: pwDuration(defTimeout)} ) type PwExtractor struct { @@ -69,30 +76,71 @@ func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errR log.Warnf("waiting for page load: %v", err) } - result = &models.TaskResult{} + parser := pageParser{ + task: task, + page: page, + } - result.Title, err = page.Title() + result, err = parser.parse() + if err != nil { + return nil, fmt.Errorf("parse page: %w", err) + } + if len(result.Items) == 0 { + return nil, fmt.Errorf("extract failed for all posts") + } + + return result, nil +} + +type pageParser struct { + task models.Task + page playwright.Page + + // next fields only for debugging. Shit code, to do better later + postIdx int + fieldIdx int +} + +// must accepts arbitrary string and error and returns just string, also logs everything. +// it is used for playwright functons that return both string and error to avoid boilerplate. +// fieldIdx is convinient variable used only for logging purposes, looks like shit, maybe i'll do it better later. +func (p *pageParser) must(s string, err error) string { + p.fieldIdx++ + if err != nil { + log.Errorf("extract post field %d: %v", p.fieldIdx, err) + return "" + } + //log.Debugf("field=%d res=%.100s", p.fieldIdx, s) + return s +} + +func (p *pageParser) parse() (*models.TaskResult, error) { + var result models.TaskResult + var err error + + result.Title, err = p.page.Title() if err != nil { return nil, fmt.Errorf("page title: %w", err) } - iconUrl, err := page.Locator("link[rel=apple-touch-icon]").First(). + iconUrl, err := p.page.Locator("link[rel=apple-touch-icon]").First(). GetAttribute("href", playwright.LocatorGetAttributeOptions{Timeout: pwDuration("100ms")}) if err != nil { log.Warnf("page icon url: %v", err) } else { - result.Icon = absUrl(iconUrl, page) + result.Icon = absUrl(iconUrl, p.page) } - posts, err := page.Locator(task.SelectorPost).All() + posts, err := p.page.Locator(p.task.SelectorPost).All() if err != nil { return nil, fmt.Errorf("post locator: %w", err) } if len(posts) == 0 { return nil, fmt.Errorf("no posts on page") } + for _, post := range posts { - item, err := e.extractPost(task, post) + item, err := p.extractPost(post) if err != nil { log.Errorf("extract post fields: %v", err) continue @@ -103,48 +151,34 @@ func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errR } result.Items = append(result.Items, item) } - if len(result.Items) == 0 { - return nil, fmt.Errorf("extract failed for all posts") - } - return result, nil + return &result, nil } -func (e *PwExtractor) extractPost(task models.Task, post playwright.Locator) (models.FeedItem, error) { - fieldIdx := 0 - must := func(s string, err error) string { - fieldIdx++ - if err != nil { - log.Errorf("extract post field %d: %v", fieldIdx, err) - return "" - } - log.Debugf("field=%d res=%.100s", fieldIdx, s) - return s - } +func (p *pageParser) extractPost(post playwright.Locator) (models.FeedItem, error) { + p.fieldIdx = 0 + p.postIdx++ var item models.FeedItem - const defTimeout = "100ms" - defOpt := playwright.LocatorTextContentOptions{Timeout: pwDuration(defTimeout)} - defOptAttr := playwright.LocatorGetAttributeOptions{Timeout: pwDuration(defTimeout)} - log.Debugf("---- POST: ----") - item.Title = must(post.Locator(task.SelectorTitle).First().TextContent(defOpt)) + item.Title = p.must(post.Locator(p.task.SelectorTitle).First().InnerText(defOptInText)) + log.Debugf("---- POST: %s ----", item.Title) - item.Link = must(post.Locator(task.SelectorLink).First().GetAttribute("href", defOptAttr)) + item.Link = p.must(post.Locator(p.task.SelectorLink).First().GetAttribute("href", defOptAttr)) page, _ := post.Page() item.Link = absUrl(item.Link, page) - item.Description = must(post.Locator(task.SelectorDescription).First().TextContent(defOpt)) + item.Description = p.must(post.Locator(p.task.SelectorDescription).First().InnerText(defOptInText)) - item.AuthorName = must(post.Locator(task.SelectorAuthor).First().TextContent(defOpt)) + item.AuthorName = p.must(post.Locator(p.task.SelectorAuthor).First().InnerText(defOptInText)) - item.AuthorLink = must(post.Locator(task.SelectorAuthor).First().GetAttribute("href", defOptAttr)) + item.AuthorLink = p.must(post.Locator(p.task.SelectorAuthor).First().GetAttribute("href", defOptAttr)) item.AuthorLink = absUrl(item.AuthorLink, page) - item.Content = must(post.Locator(task.SelectorContent).First().TextContent(defOpt)) + item.Content = p.extractContent(post) - item.Enclosure = must(post.Locator(task.SelectorEnclosure).First().GetAttribute("src", defOptAttr)) + item.Enclosure = p.must(post.Locator(p.task.SelectorEnclosure).First().GetAttribute("src", defOptAttr)) - createdDateStr := must(post.Locator(task.SelectorCreated).First().TextContent(defOpt)) + createdDateStr := p.must(post.Locator(p.task.SelectorCreated).First().InnerText(defOptInText)) log.Debugf("date=%s", createdDateStr) createdDate, err := dateparser.Parse(nil, createdDateStr) if err != nil { @@ -153,25 +187,28 @@ func (e *PwExtractor) extractPost(task models.Task, post playwright.Locator) (mo item.Created = createdDate.Time } + log.Debugf("---- END POST: %s ----", item.Title) + return item, nil } -func absUrl(link string, page playwright.Page) string { - if strings.HasPrefix(link, "/") { - pageUrl, _ := url.Parse(page.URL()) - link = fmt.Sprintf("%s://%s%s", pageUrl.Scheme, pageUrl.Host, link) - } - log.Debugf("link=%s", link) - return link -} +//go:embed extract_post.js +var extractPostScript string -// pwDuration converts string like "10s" to milliseconds float64 pointer -// needed for Playwright timeouts (wtf? why they don't use normal Durations?) -func pwDuration(s string) *float64 { - dur, err := time.ParseDuration(s) +func (p *pageParser) extractContent(post playwright.Locator) string { + postContent := post.Locator(p.task.SelectorContent) + result, err := postContent.Evaluate( + extractPostScript, + nil, + playwright.LocatorEvaluateOptions{Timeout: pwDuration("1s")}, + ) if err != nil { - panic(fmt.Errorf("failed to parse duration %s: %w", s, err)) + log.Errorf("extract post content: evaluate: %v", err) + return p.must(postContent.TextContent(defOptTextCon)) } - f64 := float64(dur.Milliseconds()) - return &f64 + resString, ok := result.(string) + if !ok { + log.Errorf("extract post content: result type mismatch: %v", result) + } + return resString } diff --git a/internal/extractors/pwextractor/utils.go b/internal/extractors/pwextractor/utils.go new file mode 100644 index 0000000..431874f --- /dev/null +++ b/internal/extractors/pwextractor/utils.go @@ -0,0 +1,32 @@ +package pwextractor + +import ( + "fmt" + "github.com/playwright-community/playwright-go" + "net/url" + "strings" + "time" +) + +func absUrl(link string, page playwright.Page) string { + if len(link) == 0 { + return "" + } + if strings.HasPrefix(link, "/") { + pageUrl, _ := url.Parse(page.URL()) + link = fmt.Sprintf("%s://%s%s", pageUrl.Scheme, pageUrl.Host, link) + } + //log.Debugf("link=%s", link) + return link +} + +// pwDuration converts string like "10s" to milliseconds float64 pointer +// needed for Playwright timeouts (wtf? why they don't use normal Durations?) +func pwDuration(s string) *float64 { + dur, err := time.ParseDuration(s) + if err != nil { + panic(fmt.Errorf("failed to parse duration %s: %w", s, err)) + } + f64 := float64(dur.Milliseconds()) + return &f64 +}