This commit is contained in:
Egor Aristov 2025-03-22 11:53:13 +03:00
parent fbf2c9be50
commit 1b91f55fd0
Signed by: egor3f
GPG Key ID: 40482A264AAEC85F
6 changed files with 117408 additions and 38 deletions

2
go.mod
View File

@ -78,9 +78,11 @@ require (
github.com/mitchellh/mapstructure v1.5.0 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/nats-io/nkeys v0.4.9 // indirect github.com/nats-io/nkeys v0.4.9 // indirect
github.com/nats-io/nuid v1.0.1 // indirect github.com/nats-io/nuid v1.0.1 // indirect
github.com/patriciy/adblock v0.0.0-20201201143319-2c60183c9ccc // indirect
github.com/pkg/errors v0.9.1 // indirect github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414 // indirect github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414 // indirect
github.com/scrapinghub/adblockgoparser v0.0.0-20200421080733-539a8d1534d6 // indirect
github.com/tetratelabs/wazero v1.2.1 // indirect github.com/tetratelabs/wazero v1.2.1 // indirect
github.com/thanhpk/randstr v1.0.4 // indirect github.com/thanhpk/randstr v1.0.4 // indirect
github.com/valyala/bytebufferpool v1.0.0 // indirect github.com/valyala/bytebufferpool v1.0.0 // indirect

4
go.sum
View File

@ -305,6 +305,8 @@ github.com/onsi/gomega v1.16.0/go.mod h1:HnhC7FXeEQY45zxNK3PPoIUhzk/80Xly9PcubAl
github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
github.com/pascaldekloe/goe v0.1.0 h1:cBOtyMzM9HTpWjXfbbunk26uA6nG3a8n06Wieeh0MwY= github.com/pascaldekloe/goe v0.1.0 h1:cBOtyMzM9HTpWjXfbbunk26uA6nG3a8n06Wieeh0MwY=
github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
github.com/patriciy/adblock v0.0.0-20201201143319-2c60183c9ccc h1:bkt157sOZooZKI1vnYwDWsIg4aU9GHIIych2fPT7QLw=
github.com/patriciy/adblock v0.0.0-20201201143319-2c60183c9ccc/go.mod h1:RzOK93vWuT8huhjtK41YJc5tpEW7pn5sHKgo77xZg6E=
github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
@ -336,6 +338,8 @@ github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUz
github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts=
github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414 h1:AJNDS0kP60X8wwWFvbLPwDuojxubj9pbfK7pjHw0vKg= github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414 h1:AJNDS0kP60X8wwWFvbLPwDuojxubj9pbfK7pjHw0vKg=
github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414/go.mod h1:gi+0XIa01GRL2eRQVjQkKGqKF3SF9vZR/HnPullcV2E= github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414/go.mod h1:gi+0XIa01GRL2eRQVjQkKGqKF3SF9vZR/HnPullcV2E=
github.com/scrapinghub/adblockgoparser v0.0.0-20200421080733-539a8d1534d6 h1:3vph+oCRtCUuedn0i28o6tNmXYV9JC6WW+opu1As8xI=
github.com/scrapinghub/adblockgoparser v0.0.0-20200421080733-539a8d1534d6/go.mod h1:+p9xynCxNcbR3dUwpBXbfyWATZc53NujWurfrsOgdQk=
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 h1:nn5Wsu0esKSJiIVhscUtVbo7ada43DJhG55ua/hjS5I= github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 h1:nn5Wsu0esKSJiIVhscUtVbo7ada43DJhG55ua/hjS5I=
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc=
github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=

View File

@ -0,0 +1,49 @@
package pwextractor
import (
_ "embed"
"errors"
"fmt"
"github.com/labstack/gommon/log"
"github.com/scrapinghub/adblockgoparser"
"net/url"
"strings"
)
// easyList holds the contents of blocklists/easylist.txt, embedded into the
// binary at build time.
//
//go:embed blocklists/easylist.txt
var easyList string
// easyPrivacy holds the contents of blocklists/easyprivacy.txt, embedded into
// the binary at build time.
//
//go:embed blocklists/easyprivacy.txt
var easyPrivacy string
// ruleSet is the package-level adblock rule set. It is assigned in init and
// consulted by allowAdblock.
var ruleSet *adblockgoparser.RuleSet
// init populates the package-level ruleSet from the embedded easyList and
// easyPrivacy blocklists, treating every line as one rule.
//
// Lines that ParseRule rejects with ErrSkipComment, ErrUnsupportedRule,
// ErrSkipHTML or ErrEmptyLine are ignored. Any other parse error panics,
// which aborts program start-up.
func init() {
	ruleSet = adblockgoparser.CreateRuleSet()
	for _, list := range []string{easyList, easyPrivacy} {
		for _, rec := range strings.Split(list, "\n") {
			// Embedded files are stored byte-for-byte, so a checkout with CRLF
			// line endings would otherwise leave a stray "\r" on every rule.
			rec = strings.TrimSuffix(rec, "\r")

			rule, err := adblockgoparser.ParseRule(rec)
			if err != nil {
				if !errors.Is(err, adblockgoparser.ErrSkipComment) &&
					!errors.Is(err, adblockgoparser.ErrUnsupportedRule) &&
					!errors.Is(err, adblockgoparser.ErrSkipHTML) &&
					!errors.Is(err, adblockgoparser.ErrEmptyLine) {
					panic(fmt.Sprintf("Adblock rule parse: %v", err))
				}
				// Expected, non-fatal rejection: skip this line.
				continue
			}
			ruleSet.AddRule(rule)
		}
	}
}
// allowAdblock reports whether a request to u is permitted by the
// package-level adblock ruleSet. A blocked URL is logged at info level.
//
// A nil u is reported as blocked (fail closed) instead of being dereferenced.
// The parameter is named u rather than url so that it does not shadow the
// imported net/url package inside the function.
func allowAdblock(u *url.URL) bool {
	if u == nil {
		return false
	}
	req := adblockgoparser.Request{
		URL: u,
	}
	if ruleSet.Allow(&req) {
		return true
	}
	log.Infof("Adblock blocked %s", u.String())
	return false
}

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -10,6 +10,7 @@ import (
"github.com/playwright-community/playwright-go" "github.com/playwright-community/playwright-go"
"maps" "maps"
"net" "net"
"net/url"
"strings" "strings"
"time" "time"
) )
@ -33,6 +34,8 @@ type PwExtractor struct {
cookieManager CookieManager cookieManager CookieManager
limiter limiter.Limiter limiter limiter.Limiter
proxyIP net.IP proxyIP net.IP
allowed int
blocked int
} }
type Config struct { type Config struct {
@ -145,42 +148,8 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
} }
}() }()
if err := bCtx.Route("**", func(route playwright.Route) { if err := e.setupInterceptors(bCtx); err != nil {
log.Debugf("Route: %s", route.Request().URL()) return fmt.Errorf("setup interceptors: %w", err)
allowHost, err := e.allowHost(route.Request().URL())
if err != nil {
log.Errorf("Allow host: %v", err)
allowHost = false
}
if allowHost {
if err := route.Continue(); err != nil {
log.Warnf("Route continue error: %v", err)
}
} else {
if err := route.Abort(); err != nil {
log.Warnf("Route abort error: %v", err)
}
}
}); err != nil {
return fmt.Errorf("set route: %w", err)
}
if err := bCtx.RouteWebSocket("**", func(route playwright.WebSocketRoute) {
log.Debugf("Websocket route: %s", route.URL())
allowHost, err := e.allowHost(route.URL())
if err != nil {
log.Errorf("Allow host: %v", err)
allowHost = false
}
if allowHost {
if _, err := route.ConnectToServer(); err != nil {
log.Warnf("Websocket connect error: %v", err)
}
} else {
route.Close()
}
}); err != nil {
return fmt.Errorf("websocket set route: %w", err)
} }
if len(cookies) > 0 { if len(cookies) > 0 {
@ -231,7 +200,13 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
start := time.Now() start := time.Now()
err = cb(page) err = cb(page)
log.Infof("Visiting page %s finished, time=%f secs, err=%v", task.URL, time.Since(start).Seconds(), err) log.Infof(
"Visiting page %s finished, time=%f secs, allowed hosts=%d, blocked hosts=%d, err=%v",
task.URL,
time.Since(start).Seconds(),
e.allowed, e.blocked,
err,
)
if len(cookies) > 0 { if len(cookies) > 0 {
bCookies, err := bCtx.Cookies(fmt.Sprintf("%s://%s", scheme, baseDomain)) bCookies, err := bCtx.Cookies(fmt.Sprintf("%s://%s", scheme, baseDomain))
@ -252,10 +227,67 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
return err return err
} }
// setupInterceptors registers filters on bCtx for every regular request and
// every WebSocket connection (pattern "**"). A request is let through only
// when both allowHost and allowAdblock accept its URL; otherwise it is
// aborted (HTTP) or closed (WebSocket).
//
// It returns a wrapped error if either route registration fails. Failures of
// Continue, Abort or ConnectToServer inside the handlers are only logged.
func (e *PwExtractor) setupInterceptors(bCtx playwright.BrowserContext) error {
	if err := bCtx.Route("**", func(route playwright.Route) {
		rawURL := route.Request().URL()
		log.Debugf("Route: %s", rawURL)
		if e.allowRequest(rawURL, "Interceptor parse url") {
			if err := route.Continue(); err != nil {
				log.Warnf("Route continue error: %v", err)
			}
			return
		}
		if err := route.Abort(); err != nil {
			log.Warnf("Route abort error: %v", err)
		}
	}); err != nil {
		return fmt.Errorf("set route: %w", err)
	}

	if err := bCtx.RouteWebSocket("**", func(route playwright.WebSocketRoute) {
		rawURL := route.URL()
		log.Debugf("Websocket route: %s", rawURL)
		if e.allowRequest(rawURL, "Interceptor websocket parse url") {
			if _, err := route.ConnectToServer(); err != nil {
				log.Warnf("Websocket connect error: %v", err)
			}
			return
		}
		route.Close()
	}); err != nil {
		return fmt.Errorf("websocket set route: %w", err)
	}

	return nil
}

// allowRequest decides whether the request to rawURL may proceed and updates
// the e.allowed / e.blocked counters accordingly. parseErrLabel prefixes the
// log message emitted when rawURL cannot be parsed.
//
// An error from allowHost or url.Parse is logged and the request is denied.
//
// NOTE(review): the counters are plain ints on the extractor and no reset is
// visible here; if Playwright invokes handlers concurrently these increments
// race, and the totals accumulate across visits — confirm.
func (e *PwExtractor) allowRequest(rawURL, parseErrLabel string) bool {
	allowed, err := e.allowHost(rawURL)
	if err != nil {
		log.Errorf("Allow host: %v", err)
		allowed = false
	}

	parsed, err := url.Parse(rawURL)
	if err != nil {
		log.Errorf("%s: %v", parseErrLabel, err)
		allowed = false
	}

	// Short-circuit keeps allowAdblock from ever seeing a nil URL.
	allowed = allowed && allowAdblock(parsed)
	if allowed {
		e.allowed++
	} else {
		e.blocked++
	}
	return allowed
}
func (e *PwExtractor) allowHost(rawUrl string) (bool, error) { func (e *PwExtractor) allowHost(rawUrl string) (bool, error) {
ips, err := getIPs(rawUrl) ips, err := getIPs(rawUrl)
if err != nil { if err != nil {
return false, fmt.Errorf("allow host get ips: %w", err) return false, fmt.Errorf("allowAdblock host get ips: %w", err)
} }
for _, ip := range ips { for _, ip := range ips {
deny := ip.IsPrivate() || ip.IsLoopback() || ip.IsUnspecified() || ip.IsLinkLocalUnicast() || ip.IsMulticast() deny := ip.IsPrivate() || ip.IsLoopback() || ip.IsUnspecified() || ip.IsLinkLocalUnicast() || ip.IsMulticast()