speed up loading by waiting for EITHER locator OR networkIdle
This commit is contained in:
parent
87ceeb4376
commit
155cb37735
@ -10,7 +10,7 @@ import (
|
|||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
log.SetLevel(log.DEBUG)
|
log.SetLevel(log.DEBUG)
|
||||||
log.SetHeader(`${level}`)
|
log.SetHeader(`${time_rfc3339_nano} ${level}`)
|
||||||
|
|
||||||
// this code is temporary!
|
// this code is temporary!
|
||||||
// todo: rewrite not to use hardcoded tasks
|
// todo: rewrite not to use hardcoded tasks
|
||||||
|
|||||||
3
go.mod
3
go.mod
@ -12,6 +12,7 @@ require (
|
|||||||
github.com/markusmobius/go-dateparser v1.2.3
|
github.com/markusmobius/go-dateparser v1.2.3
|
||||||
github.com/nats-io/nats.go v1.38.0
|
github.com/nats-io/nats.go v1.38.0
|
||||||
github.com/playwright-community/playwright-go v0.4901.0
|
github.com/playwright-community/playwright-go v0.4901.0
|
||||||
|
github.com/yassinebenaid/godump v0.11.1
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
@ -38,10 +39,10 @@ require (
|
|||||||
github.com/valyala/bytebufferpool v1.0.0 // indirect
|
github.com/valyala/bytebufferpool v1.0.0 // indirect
|
||||||
github.com/valyala/fasttemplate v1.2.2 // indirect
|
github.com/valyala/fasttemplate v1.2.2 // indirect
|
||||||
github.com/wasilibs/go-re2 v1.3.0 // indirect
|
github.com/wasilibs/go-re2 v1.3.0 // indirect
|
||||||
github.com/yassinebenaid/godump v0.11.1 // indirect
|
|
||||||
golang.org/x/crypto v0.32.0 // indirect
|
golang.org/x/crypto v0.32.0 // indirect
|
||||||
golang.org/x/exp v0.0.0-20220321173239-a90fa8a75705 // indirect
|
golang.org/x/exp v0.0.0-20220321173239-a90fa8a75705 // indirect
|
||||||
golang.org/x/net v0.34.0 // indirect
|
golang.org/x/net v0.34.0 // indirect
|
||||||
|
golang.org/x/sync v0.10.0 // indirect
|
||||||
golang.org/x/sys v0.29.0 // indirect
|
golang.org/x/sys v0.29.0 // indirect
|
||||||
golang.org/x/text v0.21.0 // indirect
|
golang.org/x/text v0.21.0 // indirect
|
||||||
golang.org/x/time v0.8.0 // indirect
|
golang.org/x/time v0.8.0 // indirect
|
||||||
|
|||||||
2
go.sum
2
go.sum
@ -109,6 +109,8 @@ golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k=
|
|||||||
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||||
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||||
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||||
|
golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ=
|
||||||
|
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||||
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||||
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
package pwextractor
|
package pwextractor
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
_ "embed"
|
_ "embed"
|
||||||
"fmt"
|
"fmt"
|
||||||
"github.com/egor3f/rssalchemy/internal/config"
|
"github.com/egor3f/rssalchemy/internal/config"
|
||||||
@ -75,13 +76,6 @@ func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errR
|
|||||||
}
|
}
|
||||||
log.Debugf("Url %s visited", task.URL)
|
log.Debugf("Url %s visited", task.URL)
|
||||||
|
|
||||||
if err := page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
|
|
||||||
State: playwright.LoadStateNetworkidle,
|
|
||||||
Timeout: pwDuration("5s"),
|
|
||||||
}); err != nil {
|
|
||||||
log.Warnf("waiting for page load: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
parser := pageParser{
|
parser := pageParser{
|
||||||
task: task,
|
task: task,
|
||||||
page: page,
|
page: page,
|
||||||
@ -124,6 +118,8 @@ func (p *pageParser) parse() (*models.TaskResult, error) {
|
|||||||
var result models.TaskResult
|
var result models.TaskResult
|
||||||
var err error
|
var err error
|
||||||
|
|
||||||
|
p.waitFullLoad()
|
||||||
|
|
||||||
result.Title, err = p.page.Title()
|
result.Title, err = p.page.Title()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("page title: %w", err)
|
return nil, fmt.Errorf("page title: %w", err)
|
||||||
@ -144,6 +140,7 @@ func (p *pageParser) parse() (*models.TaskResult, error) {
|
|||||||
if len(posts) == 0 {
|
if len(posts) == 0 {
|
||||||
return nil, fmt.Errorf("no posts on page")
|
return nil, fmt.Errorf("no posts on page")
|
||||||
}
|
}
|
||||||
|
log.Debugf("Posts count=%d", len(posts))
|
||||||
|
|
||||||
for _, post := range posts {
|
for _, post := range posts {
|
||||||
item, err := p.extractPost(post)
|
item, err := p.extractPost(post)
|
||||||
@ -161,6 +158,33 @@ func (p *pageParser) parse() (*models.TaskResult, error) {
|
|||||||
return &result, nil
|
return &result, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (p *pageParser) waitFullLoad() {
|
||||||
|
timeout := pwDuration("5s")
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
err := p.page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
|
||||||
|
State: playwright.LoadStateNetworkidle,
|
||||||
|
Timeout: timeout,
|
||||||
|
})
|
||||||
|
log.Debugf("WaitFor LoadState finished with %v", err)
|
||||||
|
cancel()
|
||||||
|
}()
|
||||||
|
go func() {
|
||||||
|
err := p.page.Locator(p.task.SelectorPost).Locator(p.task.SelectorTitle).Last().WaitFor(
|
||||||
|
playwright.LocatorWaitForOptions{
|
||||||
|
State: playwright.WaitForSelectorStateVisible,
|
||||||
|
Timeout: timeout,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
log.Debugf("WaitFor LOCATOR finished with %v", err)
|
||||||
|
cancel()
|
||||||
|
}()
|
||||||
|
|
||||||
|
<-ctx.Done()
|
||||||
|
}
|
||||||
|
|
||||||
func (p *pageParser) extractPost(post playwright.Locator) (models.FeedItem, error) {
|
func (p *pageParser) extractPost(post playwright.Locator) (models.FeedItem, error) {
|
||||||
p.fieldIdx = 0
|
p.fieldIdx = 0
|
||||||
p.postIdx++
|
p.postIdx++
|
||||||
@ -193,8 +217,6 @@ func (p *pageParser) extractPost(post playwright.Locator) (models.FeedItem, erro
|
|||||||
item.Created = createdDate.Time
|
item.Created = createdDate.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Debugf("---- END POST: %s ----", item.Title)
|
|
||||||
|
|
||||||
return item, nil
|
return item, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user