adblock
This commit is contained in:
parent
fbf2c9be50
commit
1b91f55fd0
2
go.mod
2
go.mod
@ -78,9 +78,11 @@ require (
|
|||||||
github.com/mitchellh/mapstructure v1.5.0 // indirect
|
github.com/mitchellh/mapstructure v1.5.0 // indirect
|
||||||
github.com/nats-io/nkeys v0.4.9 // indirect
|
github.com/nats-io/nkeys v0.4.9 // indirect
|
||||||
github.com/nats-io/nuid v1.0.1 // indirect
|
github.com/nats-io/nuid v1.0.1 // indirect
|
||||||
|
github.com/patriciy/adblock v0.0.0-20201201143319-2c60183c9ccc // indirect
|
||||||
github.com/pkg/errors v0.9.1 // indirect
|
github.com/pkg/errors v0.9.1 // indirect
|
||||||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
|
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
|
||||||
github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414 // indirect
|
github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414 // indirect
|
||||||
|
github.com/scrapinghub/adblockgoparser v0.0.0-20200421080733-539a8d1534d6 // indirect
|
||||||
github.com/tetratelabs/wazero v1.2.1 // indirect
|
github.com/tetratelabs/wazero v1.2.1 // indirect
|
||||||
github.com/thanhpk/randstr v1.0.4 // indirect
|
github.com/thanhpk/randstr v1.0.4 // indirect
|
||||||
github.com/valyala/bytebufferpool v1.0.0 // indirect
|
github.com/valyala/bytebufferpool v1.0.0 // indirect
|
||||||
|
|||||||
4
go.sum
4
go.sum
@ -305,6 +305,8 @@ github.com/onsi/gomega v1.16.0/go.mod h1:HnhC7FXeEQY45zxNK3PPoIUhzk/80Xly9PcubAl
|
|||||||
github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
|
github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
|
||||||
github.com/pascaldekloe/goe v0.1.0 h1:cBOtyMzM9HTpWjXfbbunk26uA6nG3a8n06Wieeh0MwY=
|
github.com/pascaldekloe/goe v0.1.0 h1:cBOtyMzM9HTpWjXfbbunk26uA6nG3a8n06Wieeh0MwY=
|
||||||
github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
|
github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
|
||||||
|
github.com/patriciy/adblock v0.0.0-20201201143319-2c60183c9ccc h1:bkt157sOZooZKI1vnYwDWsIg4aU9GHIIych2fPT7QLw=
|
||||||
|
github.com/patriciy/adblock v0.0.0-20201201143319-2c60183c9ccc/go.mod h1:RzOK93vWuT8huhjtK41YJc5tpEW7pn5sHKgo77xZg6E=
|
||||||
github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||||
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||||
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
|
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
|
||||||
@ -336,6 +338,8 @@ github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUz
|
|||||||
github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts=
|
github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts=
|
||||||
github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414 h1:AJNDS0kP60X8wwWFvbLPwDuojxubj9pbfK7pjHw0vKg=
|
github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414 h1:AJNDS0kP60X8wwWFvbLPwDuojxubj9pbfK7pjHw0vKg=
|
||||||
github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414/go.mod h1:gi+0XIa01GRL2eRQVjQkKGqKF3SF9vZR/HnPullcV2E=
|
github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414/go.mod h1:gi+0XIa01GRL2eRQVjQkKGqKF3SF9vZR/HnPullcV2E=
|
||||||
|
github.com/scrapinghub/adblockgoparser v0.0.0-20200421080733-539a8d1534d6 h1:3vph+oCRtCUuedn0i28o6tNmXYV9JC6WW+opu1As8xI=
|
||||||
|
github.com/scrapinghub/adblockgoparser v0.0.0-20200421080733-539a8d1534d6/go.mod h1:+p9xynCxNcbR3dUwpBXbfyWATZc53NujWurfrsOgdQk=
|
||||||
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 h1:nn5Wsu0esKSJiIVhscUtVbo7ada43DJhG55ua/hjS5I=
|
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 h1:nn5Wsu0esKSJiIVhscUtVbo7ada43DJhG55ua/hjS5I=
|
||||||
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc=
|
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc=
|
||||||
github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
|
github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
|
||||||
|
|||||||
49
internal/extractors/pwextractor/adblock.go
Normal file
49
internal/extractors/pwextractor/adblock.go
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
package pwextractor
|
||||||
|
|
||||||
|
import (
|
||||||
|
_ "embed"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"github.com/labstack/gommon/log"
|
||||||
|
"github.com/scrapinghub/adblockgoparser"
|
||||||
|
"net/url"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
//go:embed blocklists/easylist.txt
|
||||||
|
var easyList string
|
||||||
|
|
||||||
|
//go:embed blocklists/easyprivacy.txt
|
||||||
|
var easyPrivacy string
|
||||||
|
|
||||||
|
var ruleSet *adblockgoparser.RuleSet
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
ruleSet = adblockgoparser.CreateRuleSet()
|
||||||
|
for _, list := range []string{easyList, easyPrivacy} {
|
||||||
|
for _, rec := range strings.Split(list, "\n") {
|
||||||
|
rule, err := adblockgoparser.ParseRule(rec)
|
||||||
|
if err != nil {
|
||||||
|
if !errors.Is(err, adblockgoparser.ErrSkipComment) &&
|
||||||
|
!errors.Is(err, adblockgoparser.ErrUnsupportedRule) &&
|
||||||
|
!errors.Is(err, adblockgoparser.ErrSkipHTML) &&
|
||||||
|
!errors.Is(err, adblockgoparser.ErrEmptyLine) {
|
||||||
|
panic(fmt.Sprintf("Adblock rule parse: %v", err))
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
ruleSet.AddRule(rule)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func allowAdblock(url *url.URL) bool {
|
||||||
|
req := adblockgoparser.Request{
|
||||||
|
URL: url,
|
||||||
|
}
|
||||||
|
allow := ruleSet.Allow(&req)
|
||||||
|
if !allow {
|
||||||
|
log.Infof("Adblock blocked %s", url.String())
|
||||||
|
}
|
||||||
|
return allow
|
||||||
|
}
|
||||||
63423
internal/extractors/pwextractor/blocklists/easylist.txt
Normal file
63423
internal/extractors/pwextractor/blocklists/easylist.txt
Normal file
File diff suppressed because it is too large
Load Diff
53860
internal/extractors/pwextractor/blocklists/easyprivacy.txt
Normal file
53860
internal/extractors/pwextractor/blocklists/easyprivacy.txt
Normal file
File diff suppressed because one or more lines are too long
@ -10,6 +10,7 @@ import (
|
|||||||
"github.com/playwright-community/playwright-go"
|
"github.com/playwright-community/playwright-go"
|
||||||
"maps"
|
"maps"
|
||||||
"net"
|
"net"
|
||||||
|
"net/url"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
@ -33,6 +34,8 @@ type PwExtractor struct {
|
|||||||
cookieManager CookieManager
|
cookieManager CookieManager
|
||||||
limiter limiter.Limiter
|
limiter limiter.Limiter
|
||||||
proxyIP net.IP
|
proxyIP net.IP
|
||||||
|
allowed int
|
||||||
|
blocked int
|
||||||
}
|
}
|
||||||
|
|
||||||
type Config struct {
|
type Config struct {
|
||||||
@ -145,42 +148,8 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
if err := bCtx.Route("**", func(route playwright.Route) {
|
if err := e.setupInterceptors(bCtx); err != nil {
|
||||||
log.Debugf("Route: %s", route.Request().URL())
|
return fmt.Errorf("setup interceptors: %w", err)
|
||||||
allowHost, err := e.allowHost(route.Request().URL())
|
|
||||||
if err != nil {
|
|
||||||
log.Errorf("Allow host: %v", err)
|
|
||||||
allowHost = false
|
|
||||||
}
|
|
||||||
if allowHost {
|
|
||||||
if err := route.Continue(); err != nil {
|
|
||||||
log.Warnf("Route continue error: %v", err)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if err := route.Abort(); err != nil {
|
|
||||||
log.Warnf("Route abort error: %v", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}); err != nil {
|
|
||||||
return fmt.Errorf("set route: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := bCtx.RouteWebSocket("**", func(route playwright.WebSocketRoute) {
|
|
||||||
log.Debugf("Websocket route: %s", route.URL())
|
|
||||||
allowHost, err := e.allowHost(route.URL())
|
|
||||||
if err != nil {
|
|
||||||
log.Errorf("Allow host: %v", err)
|
|
||||||
allowHost = false
|
|
||||||
}
|
|
||||||
if allowHost {
|
|
||||||
if _, err := route.ConnectToServer(); err != nil {
|
|
||||||
log.Warnf("Websocket connect error: %v", err)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
route.Close()
|
|
||||||
}
|
|
||||||
}); err != nil {
|
|
||||||
return fmt.Errorf("websocket set route: %w", err)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(cookies) > 0 {
|
if len(cookies) > 0 {
|
||||||
@ -231,7 +200,13 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
|
|||||||
|
|
||||||
start := time.Now()
|
start := time.Now()
|
||||||
err = cb(page)
|
err = cb(page)
|
||||||
log.Infof("Visiting page %s finished, time=%f secs, err=%v", task.URL, time.Since(start).Seconds(), err)
|
log.Infof(
|
||||||
|
"Visiting page %s finished, time=%f secs, allowed hosts=%d, blocked hosts=%d, err=%v",
|
||||||
|
task.URL,
|
||||||
|
time.Since(start).Seconds(),
|
||||||
|
e.allowed, e.blocked,
|
||||||
|
err,
|
||||||
|
)
|
||||||
|
|
||||||
if len(cookies) > 0 {
|
if len(cookies) > 0 {
|
||||||
bCookies, err := bCtx.Cookies(fmt.Sprintf("%s://%s", scheme, baseDomain))
|
bCookies, err := bCtx.Cookies(fmt.Sprintf("%s://%s", scheme, baseDomain))
|
||||||
@ -252,10 +227,67 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (e *PwExtractor) setupInterceptors(bCtx playwright.BrowserContext) error {
|
||||||
|
if err := bCtx.Route("**", func(route playwright.Route) {
|
||||||
|
log.Debugf("Route: %s", route.Request().URL())
|
||||||
|
allowHost, err := e.allowHost(route.Request().URL())
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("Allow host: %v", err)
|
||||||
|
allowHost = false
|
||||||
|
}
|
||||||
|
URL, err := url.Parse(route.Request().URL())
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("Interceptor parse url: %v", err)
|
||||||
|
allowHost = false
|
||||||
|
}
|
||||||
|
allowHost = allowHost && allowAdblock(URL)
|
||||||
|
if allowHost {
|
||||||
|
e.allowed++
|
||||||
|
if err := route.Continue(); err != nil {
|
||||||
|
log.Warnf("Route continue error: %v", err)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
e.blocked++
|
||||||
|
if err := route.Abort(); err != nil {
|
||||||
|
log.Warnf("Route abort error: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}); err != nil {
|
||||||
|
return fmt.Errorf("set route: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := bCtx.RouteWebSocket("**", func(route playwright.WebSocketRoute) {
|
||||||
|
log.Debugf("Websocket route: %s", route.URL())
|
||||||
|
allowHost, err := e.allowHost(route.URL())
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("Allow host: %v", err)
|
||||||
|
allowHost = false
|
||||||
|
}
|
||||||
|
URL, err := url.Parse(route.URL())
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("Interceptor websocket parse url: %v", err)
|
||||||
|
allowHost = false
|
||||||
|
}
|
||||||
|
allowHost = allowHost && allowAdblock(URL)
|
||||||
|
if allowHost {
|
||||||
|
e.allowed++
|
||||||
|
if _, err := route.ConnectToServer(); err != nil {
|
||||||
|
log.Warnf("Websocket connect error: %v", err)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
e.blocked++
|
||||||
|
route.Close()
|
||||||
|
}
|
||||||
|
}); err != nil {
|
||||||
|
return fmt.Errorf("websocket set route: %w", err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func (e *PwExtractor) allowHost(rawUrl string) (bool, error) {
|
func (e *PwExtractor) allowHost(rawUrl string) (bool, error) {
|
||||||
ips, err := getIPs(rawUrl)
|
ips, err := getIPs(rawUrl)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return false, fmt.Errorf("allow host get ips: %w", err)
|
return false, fmt.Errorf("allowAdblock host get ips: %w", err)
|
||||||
}
|
}
|
||||||
for _, ip := range ips {
|
for _, ip := range ips {
|
||||||
deny := ip.IsPrivate() || ip.IsLoopback() || ip.IsUnspecified() || ip.IsLinkLocalUnicast() || ip.IsMulticast()
|
deny := ip.IsPrivate() || ip.IsLoopback() || ip.IsUnspecified() || ip.IsLinkLocalUnicast() || ip.IsMulticast()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user