diff --git a/cmd/extractor/extractor.go b/cmd/extractor/extractor.go index b9f2bb1..ed3865e 100644 --- a/cmd/extractor/extractor.go +++ b/cmd/extractor/extractor.go @@ -75,7 +75,9 @@ func main() { } }() + start := time.Now() result, err := pwe.Extract(task) + log.Infof("Extract took %v ms", time.Since(start).Milliseconds()) if err != nil { log.Errorf("extract: %v", err) scrResult, err := pwe.Screenshot(task) diff --git a/internal/extractors/pwextractor/pageparser.go b/internal/extractors/pwextractor/pageparser.go new file mode 100644 index 0000000..b4a20e2 --- /dev/null +++ b/internal/extractors/pwextractor/pageparser.go @@ -0,0 +1,223 @@ +package pwextractor + +import ( + "context" + _ "embed" + "fmt" + "github.com/egor3f/rssalchemy/internal/models" + "github.com/labstack/gommon/log" + "github.com/playwright-community/playwright-go" +) + +// Timeouts +var ( + defTimeout = "100ms" +) + +type pageParser struct { + task models.Task + page playwright.Page + dateParser DateParser + + // next fields only for debugging. Shit code, to do better later + postIdx int + fieldIdx int +} + +func (p *pageParser) parse() (*models.TaskResult, error) { + var result models.TaskResult + var err error + + p.waitFullLoad() + + result.Title, err = p.page.Title() + if err != nil { + return nil, fmt.Errorf("page title: %w", err) + } + + iconUrl, err := p.page.Locator("link[rel=apple-touch-icon]").First(). + GetAttribute("href", playwright.LocatorGetAttributeOptions{Timeout: pwDuration("100ms")}) + if err != nil { + log.Warnf("page icon url: %v", err) + } else { + result.Icon = absUrl(iconUrl, p.page) + } + + posts, err := p.page.Locator(p.task.SelectorPost).All() + if err != nil { + return nil, fmt.Errorf("post locator: %w", err) + } + if len(posts) == 0 { + return nil, fmt.Errorf("no posts on page") + } + log.Debugf("Posts count=%d", len(posts)) + + for _, post := range posts { + item, err := p.extractPost(post) + if err != nil { + log.Errorf("extract post fields: %v", err) + continue + } + if len(item.Title) == 0 || len(item.Link) == 0 || item.Created.IsZero() { + log.Warnf("post has no required fields, skip") + continue + } + result.Items = append(result.Items, item) + } + + return &result, nil +} + +func (p *pageParser) waitFullLoad() { + timeout := pwDuration("5s") + ctx, cancel := context.WithCancel(context.Background()) + + go func() { + err := p.page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{ + State: playwright.LoadStateNetworkidle, + Timeout: timeout, + }) + log.Debugf("WaitFor LoadState finished with %v", err) + cancel() + }() + go func() { + err := p.page.Locator(p.task.SelectorPost).Locator(p.task.SelectorTitle).Last().WaitFor( + playwright.LocatorWaitForOptions{ + State: playwright.WaitForSelectorStateVisible, + Timeout: timeout, + }, + ) + log.Debugf("WaitFor LOCATOR finished with %v", err) + cancel() + }() + + <-ctx.Done() +} + +func (p *pageParser) extractPost(post playwright.Locator) (models.FeedItem, error) { + p.fieldIdx = 0 + p.postIdx++ + var item models.FeedItem + + item.Title = newLocator(post, p.task.SelectorTitle).First().InnerText() + log.Debugf("---- POST: %s ----", item.Title) + + item.Link = newLocator(post, p.task.SelectorLink).First().GetAttribute("href") + page, _ := post.Page() + item.Link = absUrl(item.Link, page) + + if len(p.task.SelectorDescription) > 0 { + item.Description = newLocator(post, p.task.SelectorDescription).First().InnerText() + } + + item.AuthorName = newLocator(post, p.task.SelectorAuthor).First().InnerText() + + item.AuthorLink = newLocator(post, p.task.SelectorAuthor).First().GetAttribute("href") + item.AuthorLink = absUrl(item.AuthorLink, page) + + if len(p.task.SelectorContent) > 0 { + item.Content = p.extractContent(post) + } + + item.Enclosure = newLocator(post, p.task.SelectorEnclosure).First().GetAttribute("src") + + createdDateStr := newLocator(post, p.task.SelectorCreated).First().InnerText() + log.Debugf("date=%s", createdDateStr) + createdDate, err := p.dateParser.ParseDate(createdDateStr) + if err != nil { + log.Errorf("dateparser: %v", err) + } else { + item.Created = createdDate + } + + return item, nil +} + +//go:embed extract_post.js +var extractPostScript string + +func (p *pageParser) extractContent(post playwright.Locator) string { + postContent := newLocator(post, p.task.SelectorContent) + result, err := postContent.Evaluate( + extractPostScript, + nil, + playwright.LocatorEvaluateOptions{Timeout: pwDuration("1s")}, + ) + if err != nil { + log.Errorf("extract post content: evaluate: %v", err) + return postContent.TextContent() + } + resString, ok := result.(string) + if !ok { + log.Errorf("extract post content: result type mismatch: %v", result) + } + return resString +} + +type locator struct { + selector string + playwright.Locator +} + +func newLocator(parent playwright.Locator, selector string) *locator { + return &locator{ + selector: selector, + Locator: parent.Locator(selector), + } +} + +func (l *locator) String() string { + return l.selector +} + +func (l *locator) checkVisible() bool { + visible, err := l.IsVisible() + if err != nil { + log.Errorf("locator %s isVisible: %v", l, err) + return false + } + if !visible { + log.Warnf("locator %s is not visible", l) + } + return visible +} + +func (l *locator) First() *locator { + return &locator{l.selector, l.Locator.First()} +} + +func (l *locator) InnerText() string { + if !l.checkVisible() { + return "" + } + t, err := l.Locator.InnerText(playwright.LocatorInnerTextOptions{Timeout: pwDuration(defTimeout)}) + if err != nil { + log.Errorf("locator %s innerText: %v", l, err) + return "" + } + return t +} + +func (l *locator) GetAttribute(name string) string { + if !l.checkVisible() { + return "" + } + t, err := l.Locator.GetAttribute(name, playwright.LocatorGetAttributeOptions{Timeout: pwDuration(defTimeout)}) + if err != nil { + log.Errorf("locator %s getAttribute %s: %v", l, name, err) + return "" + } + return t +} + +func (l *locator) TextContent() string { + if !l.checkVisible() { + return "" + } + t, err := l.Locator.TextContent(playwright.LocatorTextContentOptions{Timeout: pwDuration(defTimeout)}) + if err != nil { + log.Errorf("locator %s textContent: %v", l, err) + return "" + } + return t +} diff --git a/internal/extractors/pwextractor/pwextractor.go b/internal/extractors/pwextractor/pwextractor.go index 9856282..141b5d8 100644 --- a/internal/extractors/pwextractor/pwextractor.go +++ b/internal/extractors/pwextractor/pwextractor.go @@ -1,8 +1,6 @@ package pwextractor import ( - "context" - _ "embed" "fmt" "github.com/egor3f/rssalchemy/internal/models" "github.com/labstack/gommon/log" @@ -12,14 +10,6 @@ import ( "time" ) -// Timeouts -var ( - defTimeout = "50ms" - defOptInText = playwright.LocatorInnerTextOptions{Timeout: pwDuration(defTimeout)} - defOptTextCon = playwright.LocatorTextContentOptions{Timeout: pwDuration(defTimeout)} - defOptAttr = playwright.LocatorGetAttributeOptions{Timeout: pwDuration(defTimeout)} -) - var userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36" var secChUa = `"Chromium";v="132", "Google Chrome";v="132", "Not-A.Brand";v="99"` @@ -229,156 +219,3 @@ func (e *PwExtractor) Screenshot(task models.Task) (result *models.ScreenshotTas }) return } - -type pageParser struct { - task models.Task - page playwright.Page - dateParser DateParser - - // next fields only for debugging. Shit code, to do better later - postIdx int - fieldIdx int -} - -// must accepts arbitrary string and error and returns just string, also logs everything. -// it is used for playwright functons that return both string and error to avoid boilerplate. -// fieldIdx is convinient variable used only for logging purposes, looks like shit, maybe i'll do it better later. -func (p *pageParser) must(s string, err error) string { - p.fieldIdx++ - if err != nil { - log.Errorf("extract post field %d: %v", p.fieldIdx, err) - return "" - } - //log.Debugf("field=%d res=%.100s", p.fieldIdx, s) - return s -} - -func (p *pageParser) parse() (*models.TaskResult, error) { - var result models.TaskResult - var err error - - p.waitFullLoad() - - result.Title, err = p.page.Title() - if err != nil { - return nil, fmt.Errorf("page title: %w", err) - } - - iconUrl, err := p.page.Locator("link[rel=apple-touch-icon]").First(). - GetAttribute("href", playwright.LocatorGetAttributeOptions{Timeout: pwDuration("100ms")}) - if err != nil { - log.Warnf("page icon url: %v", err) - } else { - result.Icon = absUrl(iconUrl, p.page) - } - - posts, err := p.page.Locator(p.task.SelectorPost).All() - if err != nil { - return nil, fmt.Errorf("post locator: %w", err) - } - if len(posts) == 0 { - return nil, fmt.Errorf("no posts on page") - } - log.Debugf("Posts count=%d", len(posts)) - - for _, post := range posts { - item, err := p.extractPost(post) - if err != nil { - log.Errorf("extract post fields: %v", err) - continue - } - if len(item.Title) == 0 || len(item.Link) == 0 || item.Created.IsZero() { - log.Warnf("post has no required fields, skip") - continue - } - result.Items = append(result.Items, item) - } - - return &result, nil -} - -func (p *pageParser) waitFullLoad() { - timeout := pwDuration("5s") - ctx, cancel := context.WithCancel(context.Background()) - - go func() { - err := p.page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{ - State: playwright.LoadStateNetworkidle, - Timeout: timeout, - }) - log.Debugf("WaitFor LoadState finished with %v", err) - cancel() - }() - go func() { - err := p.page.Locator(p.task.SelectorPost).Locator(p.task.SelectorTitle).Last().WaitFor( - playwright.LocatorWaitForOptions{ - State: playwright.WaitForSelectorStateVisible, - Timeout: timeout, - }, - ) - log.Debugf("WaitFor LOCATOR finished with %v", err) - cancel() - }() - - <-ctx.Done() -} - -func (p *pageParser) extractPost(post playwright.Locator) (models.FeedItem, error) { - p.fieldIdx = 0 - p.postIdx++ - var item models.FeedItem - - item.Title = p.must(post.Locator(p.task.SelectorTitle).First().InnerText(defOptInText)) - log.Debugf("---- POST: %s ----", item.Title) - - item.Link = p.must(post.Locator(p.task.SelectorLink).First().GetAttribute("href", defOptAttr)) - page, _ := post.Page() - item.Link = absUrl(item.Link, page) - - if len(p.task.SelectorDescription) > 0 { - item.Description = p.must(post.Locator(p.task.SelectorDescription).First().InnerText(defOptInText)) - } - - item.AuthorName = p.must(post.Locator(p.task.SelectorAuthor).First().InnerText(defOptInText)) - - item.AuthorLink = p.must(post.Locator(p.task.SelectorAuthor).First().GetAttribute("href", defOptAttr)) - item.AuthorLink = absUrl(item.AuthorLink, page) - - if len(p.task.SelectorContent) > 0 { - item.Content = p.extractContent(post) - } - - item.Enclosure = p.must(post.Locator(p.task.SelectorEnclosure).First().GetAttribute("src", defOptAttr)) - - createdDateStr := p.must(post.Locator(p.task.SelectorCreated).First().InnerText(defOptInText)) - log.Debugf("date=%s", createdDateStr) - createdDate, err := p.dateParser.ParseDate(createdDateStr) - if err != nil { - log.Errorf("dateparser: %v", err) - } else { - item.Created = createdDate - } - - return item, nil -} - -//go:embed extract_post.js -var extractPostScript string - -func (p *pageParser) extractContent(post playwright.Locator) string { - postContent := post.Locator(p.task.SelectorContent) - result, err := postContent.Evaluate( - extractPostScript, - nil, - playwright.LocatorEvaluateOptions{Timeout: pwDuration("1s")}, - ) - if err != nil { - log.Errorf("extract post content: evaluate: %v", err) - return p.must(postContent.TextContent(defOptTextCon)) - } - resString, ok := result.(string) - if !ok { - log.Errorf("extract post content: result type mismatch: %v", result) - } - return resString -}