diff --git a/cmd/extractor/extractor.go b/cmd/extractor/extractor.go index 0dfd5ea..41159cd 100644 --- a/cmd/extractor/extractor.go +++ b/cmd/extractor/extractor.go @@ -10,7 +10,7 @@ import ( func main() { log.SetLevel(log.DEBUG) - log.SetHeader(`${level}`) + log.SetHeader(`${time_rfc3339_nano} ${level}`) // this code is temporary! // todo: rewrite not to use hardcoded tasks diff --git a/go.mod b/go.mod index 278906c..9d08717 100644 --- a/go.mod +++ b/go.mod @@ -12,6 +12,7 @@ require ( github.com/markusmobius/go-dateparser v1.2.3 github.com/nats-io/nats.go v1.38.0 github.com/playwright-community/playwright-go v0.4901.0 + github.com/yassinebenaid/godump v0.11.1 ) require ( @@ -38,10 +39,10 @@ require ( github.com/valyala/bytebufferpool v1.0.0 // indirect github.com/valyala/fasttemplate v1.2.2 // indirect github.com/wasilibs/go-re2 v1.3.0 // indirect - github.com/yassinebenaid/godump v0.11.1 // indirect golang.org/x/crypto v0.32.0 // indirect golang.org/x/exp v0.0.0-20220321173239-a90fa8a75705 // indirect golang.org/x/net v0.34.0 // indirect + golang.org/x/sync v0.10.0 // indirect golang.org/x/sys v0.29.0 // indirect golang.org/x/text v0.21.0 // indirect golang.org/x/time v0.8.0 // indirect diff --git a/go.sum b/go.sum index d725b36..dc2ed57 100644 --- a/go.sum +++ b/go.sum @@ -109,6 +109,8 @@ golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= +golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= diff --git a/internal/extractors/pwextractor/pwextractor.go b/internal/extractors/pwextractor/pwextractor.go index 78e2cc9..d6ecf27 100644 --- a/internal/extractors/pwextractor/pwextractor.go +++ b/internal/extractors/pwextractor/pwextractor.go @@ -1,6 +1,7 @@ package pwextractor import ( + "context" _ "embed" "fmt" "github.com/egor3f/rssalchemy/internal/config" @@ -75,13 +76,6 @@ func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errR } log.Debugf("Url %s visited", task.URL) - if err := page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{ - State: playwright.LoadStateNetworkidle, - Timeout: pwDuration("5s"), - }); err != nil { - log.Warnf("waiting for page load: %v", err) - } - parser := pageParser{ task: task, page: page, @@ -124,6 +118,8 @@ func (p *pageParser) parse() (*models.TaskResult, error) { var result models.TaskResult var err error + p.waitFullLoad() + result.Title, err = p.page.Title() if err != nil { return nil, fmt.Errorf("page title: %w", err) @@ -144,6 +140,7 @@ func (p *pageParser) parse() (*models.TaskResult, error) { if len(posts) == 0 { return nil, fmt.Errorf("no posts on page") } + log.Debugf("Posts count=%d", len(posts)) for _, post := range posts { item, err := p.extractPost(post) @@ -161,6 +158,33 @@ func (p *pageParser) parse() (*models.TaskResult, error) { return &result, nil } +func (p *pageParser) waitFullLoad() { + timeout := pwDuration("5s") + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + go func() { + err := p.page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{ + State: playwright.LoadStateNetworkidle, + Timeout: timeout, + }) + log.Debugf("WaitFor LoadState finished with %v", err) + cancel() + }() + go func() { + err := p.page.Locator(p.task.SelectorPost).Locator(p.task.SelectorTitle).Last().WaitFor( + playwright.LocatorWaitForOptions{ + State: playwright.WaitForSelectorStateVisible, + Timeout: timeout, + }, + ) + log.Debugf("WaitFor LOCATOR finished with %v", err) + cancel() + }() + + <-ctx.Done() +} + func (p *pageParser) extractPost(post playwright.Locator) (models.FeedItem, error) { p.fieldIdx = 0 p.postIdx++ @@ -193,8 +217,6 @@ func (p *pageParser) extractPost(post playwright.Locator) (models.FeedItem, erro item.Created = createdDate.Time } - log.Debugf("---- END POST: %s ----", item.Title) - return item, nil }