This commit is contained in:
Egor Aristov 2025-03-23 00:59:17 +03:00
parent fbf2c9be50
commit 893bc78ba8
Signed by: egor3f
GPG Key ID: 40482A264AAEC85F
6 changed files with 117417 additions and 38 deletions

2
go.mod
View File

@ -15,8 +15,10 @@ require (
github.com/markusmobius/go-dateparser v1.2.3 github.com/markusmobius/go-dateparser v1.2.3
github.com/mennanov/limiters v1.11.0 github.com/mennanov/limiters v1.11.0
github.com/nats-io/nats.go v1.38.0 github.com/nats-io/nats.go v1.38.0
github.com/patriciy/adblock v0.0.0-20201201143319-2c60183c9ccc
github.com/playwright-community/playwright-go v0.5001.0 github.com/playwright-community/playwright-go v0.5001.0
github.com/redis/go-redis/v9 v9.7.0 github.com/redis/go-redis/v9 v9.7.0
github.com/scrapinghub/adblockgoparser v0.0.0-20200421080733-539a8d1534d6
github.com/srikrsna/protoc-gen-gotag v1.0.2 github.com/srikrsna/protoc-gen-gotag v1.0.2
github.com/stretchr/testify v1.10.0 github.com/stretchr/testify v1.10.0
golang.org/x/time v0.8.0 golang.org/x/time v0.8.0

4
go.sum
View File

@ -305,6 +305,8 @@ github.com/onsi/gomega v1.16.0/go.mod h1:HnhC7FXeEQY45zxNK3PPoIUhzk/80Xly9PcubAl
github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
github.com/pascaldekloe/goe v0.1.0 h1:cBOtyMzM9HTpWjXfbbunk26uA6nG3a8n06Wieeh0MwY= github.com/pascaldekloe/goe v0.1.0 h1:cBOtyMzM9HTpWjXfbbunk26uA6nG3a8n06Wieeh0MwY=
github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
github.com/patriciy/adblock v0.0.0-20201201143319-2c60183c9ccc h1:bkt157sOZooZKI1vnYwDWsIg4aU9GHIIych2fPT7QLw=
github.com/patriciy/adblock v0.0.0-20201201143319-2c60183c9ccc/go.mod h1:RzOK93vWuT8huhjtK41YJc5tpEW7pn5sHKgo77xZg6E=
github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
@ -336,6 +338,8 @@ github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUz
github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts=
github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414 h1:AJNDS0kP60X8wwWFvbLPwDuojxubj9pbfK7pjHw0vKg= github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414 h1:AJNDS0kP60X8wwWFvbLPwDuojxubj9pbfK7pjHw0vKg=
github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414/go.mod h1:gi+0XIa01GRL2eRQVjQkKGqKF3SF9vZR/HnPullcV2E= github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414/go.mod h1:gi+0XIa01GRL2eRQVjQkKGqKF3SF9vZR/HnPullcV2E=
github.com/scrapinghub/adblockgoparser v0.0.0-20200421080733-539a8d1534d6 h1:3vph+oCRtCUuedn0i28o6tNmXYV9JC6WW+opu1As8xI=
github.com/scrapinghub/adblockgoparser v0.0.0-20200421080733-539a8d1534d6/go.mod h1:+p9xynCxNcbR3dUwpBXbfyWATZc53NujWurfrsOgdQk=
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 h1:nn5Wsu0esKSJiIVhscUtVbo7ada43DJhG55ua/hjS5I= github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 h1:nn5Wsu0esKSJiIVhscUtVbo7ada43DJhG55ua/hjS5I=
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc=
github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=

View File

@ -0,0 +1,58 @@
package pwextractor
import (
"bytes"
_ "embed"
"fmt"
"github.com/labstack/gommon/log"
"github.com/patriciy/adblock/adblock"
"github.com/playwright-community/playwright-go"
"net/url"
"sync"
)
//go:embed blocklists/easylist.txt
var easyList []byte
//go:embed blocklists/easyprivacy.txt
var easyPrivacy []byte
var matcher *adblock.RuleMatcher
var matcherMu sync.Mutex
// init parses the embedded EasyList filter list and loads every rule into the
// package-level matcher. The EasyPrivacy list is currently left out (its write
// is commented out below). Any parse or registration failure panics, so a
// broken list aborts startup instead of silently disabling filtering.
func init() {
	var lists bytes.Buffer
	lists.Write(easyList)
	//lists.Write(easyPrivacy)

	parsed, err := adblock.ParseRules(bytes.NewReader(lists.Bytes()))
	if err != nil {
		panic(fmt.Sprintf("Parse rules: %v", err))
	}

	matcher = adblock.NewMatcher()
	for idx := range parsed {
		// The rule's position in the parsed list doubles as its id; that id
		// is what Match reports back for a blocked request.
		if err := matcher.AddRule(parsed[idx], idx); err != nil {
			panic(fmt.Sprintf("Add rule: %v", err))
		}
	}
}
func allowAdblock(url *url.URL) bool {
matcherMu.Lock()
defer matcherMu.Unlock()
req := adblock.Request{
URL: url.String(),
Domain: url.Host,
GenericBlock: playwright.Bool(false),
}
deny, ruleId, err := matcher.Match(&req)
if err != nil {
log.Errorf("Adblock error: %v (url %s)", err, url.String())
return true
}
if deny {
log.Infof("Adblock blocked %s (ruleId %d)", url.String(), ruleId)
}
return !deny
}

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -10,6 +10,7 @@ import (
"github.com/playwright-community/playwright-go" "github.com/playwright-community/playwright-go"
"maps" "maps"
"net" "net"
"net/url"
"strings" "strings"
"time" "time"
) )
@ -33,6 +34,8 @@ type PwExtractor struct {
cookieManager CookieManager cookieManager CookieManager
limiter limiter.Limiter limiter limiter.Limiter
proxyIP net.IP proxyIP net.IP
allowed int
blocked int
} }
type Config struct { type Config struct {
@ -145,42 +148,8 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
} }
}() }()
if err := bCtx.Route("**", func(route playwright.Route) { if err := e.setupInterceptors(bCtx); err != nil {
log.Debugf("Route: %s", route.Request().URL()) return fmt.Errorf("setup interceptors: %w", err)
allowHost, err := e.allowHost(route.Request().URL())
if err != nil {
log.Errorf("Allow host: %v", err)
allowHost = false
}
if allowHost {
if err := route.Continue(); err != nil {
log.Warnf("Route continue error: %v", err)
}
} else {
if err := route.Abort(); err != nil {
log.Warnf("Route abort error: %v", err)
}
}
}); err != nil {
return fmt.Errorf("set route: %w", err)
}
if err := bCtx.RouteWebSocket("**", func(route playwright.WebSocketRoute) {
log.Debugf("Websocket route: %s", route.URL())
allowHost, err := e.allowHost(route.URL())
if err != nil {
log.Errorf("Allow host: %v", err)
allowHost = false
}
if allowHost {
if _, err := route.ConnectToServer(); err != nil {
log.Warnf("Websocket connect error: %v", err)
}
} else {
route.Close()
}
}); err != nil {
return fmt.Errorf("websocket set route: %w", err)
} }
if len(cookies) > 0 { if len(cookies) > 0 {
@ -231,7 +200,13 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
start := time.Now() start := time.Now()
err = cb(page) err = cb(page)
log.Infof("Visiting page %s finished, time=%f secs, err=%v", task.URL, time.Since(start).Seconds(), err) log.Infof(
"Visiting page %s finished, time=%f secs, allowed hosts=%d, blocked hosts=%d, err=%v",
task.URL,
time.Since(start).Seconds(),
e.allowed, e.blocked,
err,
)
if len(cookies) > 0 { if len(cookies) > 0 {
bCookies, err := bCtx.Cookies(fmt.Sprintf("%s://%s", scheme, baseDomain)) bCookies, err := bCtx.Cookies(fmt.Sprintf("%s://%s", scheme, baseDomain))
@ -252,10 +227,67 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
return err return err
} }
// setupInterceptors installs catch-all ("**") handlers on bCtx for regular
// requests and for WebSocket connections. A request goes through only if
// allowHost accepts it, its URL parses, and allowAdblock accepts it; otherwise
// it is aborted (or the socket closed). Each decision bumps e.allowed or
// e.blocked.
//
// It returns an error only when registering a handler fails; errors inside a
// handler are logged.
//
// NOTE(review): the counters are plain ints incremented from inside the
// handlers — confirm the handlers are never invoked concurrently.
func (e *PwExtractor) setupInterceptors(bCtx playwright.BrowserContext) error {
	onRequest := func(route playwright.Route) {
		rawURL := route.Request().URL()
		log.Debugf("Route: %s", rawURL)

		permitted, err := e.allowHost(rawURL)
		if err != nil {
			log.Errorf("Allow host: %v", err)
			permitted = false
		}
		parsed, err := url.Parse(rawURL)
		if err != nil {
			log.Errorf("Interceptor parse url: %v", err)
			permitted = false
		}

		// Short-circuit: allowAdblock is only consulted when every earlier
		// check passed, so it never sees the nil result of a failed parse.
		if !permitted || !allowAdblock(parsed) {
			e.blocked++
			if err := route.Abort(); err != nil {
				log.Warnf("Route abort error: %v", err)
			}
			return
		}

		e.allowed++
		if err := route.Continue(); err != nil {
			log.Warnf("Route continue error: %v", err)
		}
	}
	if err := bCtx.Route("**", onRequest); err != nil {
		return fmt.Errorf("set route: %w", err)
	}

	onWebSocket := func(route playwright.WebSocketRoute) {
		rawURL := route.URL()
		log.Debugf("Websocket route: %s", rawURL)

		permitted, err := e.allowHost(rawURL)
		if err != nil {
			log.Errorf("Allow host: %v", err)
			permitted = false
		}
		parsed, err := url.Parse(rawURL)
		if err != nil {
			log.Errorf("Interceptor websocket parse url: %v", err)
			permitted = false
		}

		// Same short-circuit as for regular requests.
		if !permitted || !allowAdblock(parsed) {
			e.blocked++
			route.Close()
			return
		}

		e.allowed++
		if _, err := route.ConnectToServer(); err != nil {
			log.Warnf("Websocket connect error: %v", err)
		}
	}
	if err := bCtx.RouteWebSocket("**", onWebSocket); err != nil {
		return fmt.Errorf("websocket set route: %w", err)
	}

	return nil
}
func (e *PwExtractor) allowHost(rawUrl string) (bool, error) { func (e *PwExtractor) allowHost(rawUrl string) (bool, error) {
ips, err := getIPs(rawUrl) ips, err := getIPs(rawUrl)
if err != nil { if err != nil {
return false, fmt.Errorf("allow host get ips: %w", err) return false, fmt.Errorf("allowAdblock host get ips: %w", err)
} }
for _, ip := range ips { for _, ip := range ips {
deny := ip.IsPrivate() || ip.IsLoopback() || ip.IsUnspecified() || ip.IsLinkLocalUnicast() || ip.IsMulticast() deny := ip.IsPrivate() || ip.IsLoopback() || ip.IsUnspecified() || ip.IsLinkLocalUnicast() || ip.IsMulticast()