adblock
This commit is contained in:
parent
fbf2c9be50
commit
8f12325b7c
2
go.mod
2
go.mod
@ -78,9 +78,11 @@ require (
|
||||
github.com/mitchellh/mapstructure v1.5.0 // indirect
|
||||
github.com/nats-io/nkeys v0.4.9 // indirect
|
||||
github.com/nats-io/nuid v1.0.1 // indirect
|
||||
github.com/patriciy/adblock v0.0.0-20201201143319-2c60183c9ccc // indirect
|
||||
github.com/pkg/errors v0.9.1 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
|
||||
github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414 // indirect
|
||||
github.com/scrapinghub/adblockgoparser v0.0.0-20200421080733-539a8d1534d6 // indirect
|
||||
github.com/tetratelabs/wazero v1.2.1 // indirect
|
||||
github.com/thanhpk/randstr v1.0.4 // indirect
|
||||
github.com/valyala/bytebufferpool v1.0.0 // indirect
|
||||
|
||||
4
go.sum
4
go.sum
@ -305,6 +305,8 @@ github.com/onsi/gomega v1.16.0/go.mod h1:HnhC7FXeEQY45zxNK3PPoIUhzk/80Xly9PcubAl
|
||||
github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
|
||||
github.com/pascaldekloe/goe v0.1.0 h1:cBOtyMzM9HTpWjXfbbunk26uA6nG3a8n06Wieeh0MwY=
|
||||
github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
|
||||
github.com/patriciy/adblock v0.0.0-20201201143319-2c60183c9ccc h1:bkt157sOZooZKI1vnYwDWsIg4aU9GHIIych2fPT7QLw=
|
||||
github.com/patriciy/adblock v0.0.0-20201201143319-2c60183c9ccc/go.mod h1:RzOK93vWuT8huhjtK41YJc5tpEW7pn5sHKgo77xZg6E=
|
||||
github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
|
||||
@ -336,6 +338,8 @@ github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUz
|
||||
github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts=
|
||||
github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414 h1:AJNDS0kP60X8wwWFvbLPwDuojxubj9pbfK7pjHw0vKg=
|
||||
github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414/go.mod h1:gi+0XIa01GRL2eRQVjQkKGqKF3SF9vZR/HnPullcV2E=
|
||||
github.com/scrapinghub/adblockgoparser v0.0.0-20200421080733-539a8d1534d6 h1:3vph+oCRtCUuedn0i28o6tNmXYV9JC6WW+opu1As8xI=
|
||||
github.com/scrapinghub/adblockgoparser v0.0.0-20200421080733-539a8d1534d6/go.mod h1:+p9xynCxNcbR3dUwpBXbfyWATZc53NujWurfrsOgdQk=
|
||||
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 h1:nn5Wsu0esKSJiIVhscUtVbo7ada43DJhG55ua/hjS5I=
|
||||
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc=
|
||||
github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
|
||||
|
||||
49
internal/extractors/pwextractor/adblock.go
Normal file
49
internal/extractors/pwextractor/adblock.go
Normal file
@ -0,0 +1,49 @@
|
||||
package pwextractor
|
||||
|
||||
import (
|
||||
_ "embed"
|
||||
"errors"
|
||||
"fmt"
|
||||
"github.com/labstack/gommon/log"
|
||||
"github.com/scrapinghub/adblockgoparser"
|
||||
"net/url"
|
||||
"strings"
|
||||
)
|
||||
|
||||
//go:embed blocklists/easylist.txt
|
||||
var easyList string // raw text of blocklists/easylist.txt, filled in at build time by the go:embed directive above
|
||||
|
||||
//go:embed blocklists/easyprivacy.txt
|
||||
var easyPrivacy string // raw text of blocklists/easyprivacy.txt, filled in at build time by the go:embed directive above
|
||||
|
||||
// ruleSet is the package-wide adblock rule set. It is created and populated
// once in init from the embedded lists and queried by allowAdblock.
var ruleSet *adblockgoparser.RuleSet
|
||||
|
||||
func init() {
|
||||
ruleSet = adblockgoparser.CreateRuleSet()
|
||||
for _, list := range []string{easyList, easyPrivacy} {
|
||||
for _, rec := range strings.Split(list, "\n") {
|
||||
rule, err := adblockgoparser.ParseRule(rec)
|
||||
if err != nil {
|
||||
if !errors.Is(err, adblockgoparser.ErrSkipComment) &&
|
||||
!errors.Is(err, adblockgoparser.ErrUnsupportedRule) &&
|
||||
!errors.Is(err, adblockgoparser.ErrSkipHTML) &&
|
||||
!errors.Is(err, adblockgoparser.ErrEmptyLine) {
|
||||
panic(fmt.Sprintf("Adblock rule parse: %v", err))
|
||||
}
|
||||
continue
|
||||
}
|
||||
ruleSet.AddRule(rule)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func allowAdblock(url *url.URL) bool {
|
||||
req := adblockgoparser.Request{
|
||||
URL: url,
|
||||
}
|
||||
allow := ruleSet.Allow(&req)
|
||||
if !allow {
|
||||
log.Infof("Adblock blocked %s", url.String())
|
||||
}
|
||||
return allow
|
||||
}
|
||||
63423
internal/extractors/pwextractor/blocklists/easylist.txt
Normal file
63423
internal/extractors/pwextractor/blocklists/easylist.txt
Normal file
File diff suppressed because it is too large
Load Diff
53860
internal/extractors/pwextractor/blocklists/easyprivacy.txt
Normal file
53860
internal/extractors/pwextractor/blocklists/easyprivacy.txt
Normal file
File diff suppressed because one or more lines are too long
@ -10,6 +10,7 @@ import (
|
||||
"github.com/playwright-community/playwright-go"
|
||||
"maps"
|
||||
"net"
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
@ -33,6 +34,8 @@ type PwExtractor struct {
|
||||
cookieManager CookieManager
|
||||
limiter limiter.Limiter
|
||||
proxyIP net.IP
|
||||
allowed int
|
||||
blocked int
|
||||
}
|
||||
|
||||
type Config struct {
|
||||
@ -145,42 +148,8 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
|
||||
}
|
||||
}()
|
||||
|
||||
if err := bCtx.Route("**", func(route playwright.Route) {
|
||||
log.Debugf("Route: %s", route.Request().URL())
|
||||
allowHost, err := e.allowHost(route.Request().URL())
|
||||
if err != nil {
|
||||
log.Errorf("Allow host: %v", err)
|
||||
allowHost = false
|
||||
}
|
||||
if allowHost {
|
||||
if err := route.Continue(); err != nil {
|
||||
log.Warnf("Route continue error: %v", err)
|
||||
}
|
||||
} else {
|
||||
if err := route.Abort(); err != nil {
|
||||
log.Warnf("Route abort error: %v", err)
|
||||
}
|
||||
}
|
||||
}); err != nil {
|
||||
return fmt.Errorf("set route: %w", err)
|
||||
}
|
||||
|
||||
if err := bCtx.RouteWebSocket("**", func(route playwright.WebSocketRoute) {
|
||||
log.Debugf("Websocket route: %s", route.URL())
|
||||
allowHost, err := e.allowHost(route.URL())
|
||||
if err != nil {
|
||||
log.Errorf("Allow host: %v", err)
|
||||
allowHost = false
|
||||
}
|
||||
if allowHost {
|
||||
if _, err := route.ConnectToServer(); err != nil {
|
||||
log.Warnf("Websocket connect error: %v", err)
|
||||
}
|
||||
} else {
|
||||
route.Close()
|
||||
}
|
||||
}); err != nil {
|
||||
return fmt.Errorf("websocket set route: %w", err)
|
||||
if err := e.setupInterceptors(bCtx); err != nil {
|
||||
return fmt.Errorf("setup interceptors: %w", err)
|
||||
}
|
||||
|
||||
if len(cookies) > 0 {
|
||||
@ -231,7 +200,13 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
|
||||
|
||||
start := time.Now()
|
||||
err = cb(page)
|
||||
log.Infof("Visiting page %s finished, time=%f secs, err=%v", task.URL, time.Since(start).Seconds(), err)
|
||||
log.Infof(
|
||||
"Visiting page %s finished, time=%f secs, allowed hosts=%d, blocked hosts=%d, err=%v",
|
||||
task.URL,
|
||||
time.Since(start).Seconds(),
|
||||
e.allowed, e.blocked,
|
||||
err,
|
||||
)
|
||||
|
||||
if len(cookies) > 0 {
|
||||
bCookies, err := bCtx.Cookies(fmt.Sprintf("%s://%s", scheme, baseDomain))
|
||||
@ -252,10 +227,67 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
|
||||
return err
|
||||
}
|
||||
|
||||
func (e *PwExtractor) setupInterceptors(bCtx playwright.BrowserContext) error {
|
||||
if err := bCtx.Route("**", func(route playwright.Route) {
|
||||
log.Debugf("Route: %s", route.Request().URL())
|
||||
allowHost, err := e.allowHost(route.Request().URL())
|
||||
if err != nil {
|
||||
log.Errorf("Allow host: %v", err)
|
||||
allowHost = false
|
||||
}
|
||||
URL, err := url.Parse(route.Request().URL())
|
||||
if err != nil {
|
||||
log.Errorf("Interceptor parse url: %v", err)
|
||||
allowHost = false
|
||||
}
|
||||
allowHost = allowHost && allowAdblock(URL)
|
||||
if allowHost {
|
||||
e.allowed++
|
||||
if err := route.Continue(); err != nil {
|
||||
log.Warnf("Route continue error: %v", err)
|
||||
}
|
||||
} else {
|
||||
e.blocked++
|
||||
if err := route.Abort(); err != nil {
|
||||
log.Warnf("Route abort error: %v", err)
|
||||
}
|
||||
}
|
||||
}); err != nil {
|
||||
return fmt.Errorf("set route: %w", err)
|
||||
}
|
||||
|
||||
if err := bCtx.RouteWebSocket("**", func(route playwright.WebSocketRoute) {
|
||||
log.Debugf("Websocket route: %s", route.URL())
|
||||
allowHost, err := e.allowHost(route.URL())
|
||||
if err != nil {
|
||||
log.Errorf("Allow host: %v", err)
|
||||
allowHost = false
|
||||
}
|
||||
URL, err := url.Parse(route.URL())
|
||||
if err != nil {
|
||||
log.Errorf("Interceptor websocket parse url: %v", err)
|
||||
allowHost = false
|
||||
}
|
||||
allowHost = allowHost && allowAdblock(URL)
|
||||
if allowHost {
|
||||
e.allowed++
|
||||
if _, err := route.ConnectToServer(); err != nil {
|
||||
log.Warnf("Websocket connect error: %v", err)
|
||||
}
|
||||
} else {
|
||||
e.blocked++
|
||||
route.Close()
|
||||
}
|
||||
}); err != nil {
|
||||
return fmt.Errorf("websocket set route: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (e *PwExtractor) allowHost(rawUrl string) (bool, error) {
|
||||
ips, err := getIPs(rawUrl)
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("allow host get ips: %w", err)
|
||||
return false, fmt.Errorf("allowAdblock host get ips: %w", err)
|
||||
}
|
||||
for _, ip := range ips {
|
||||
deny := ip.IsPrivate() || ip.IsLoopback() || ip.IsUnspecified() || ip.IsLinkLocalUnicast() || ip.IsMulticast()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user