adblock
This commit is contained in:
parent
31bbc97f9b
commit
e0af3770f6
11
go.mod
11
go.mod
@ -1,6 +1,8 @@
|
||||
module github.com/egor3f/rssalchemy
|
||||
|
||||
go 1.23
|
||||
go 1.23.2
|
||||
|
||||
toolchain go1.24.0
|
||||
|
||||
require (
|
||||
github.com/ericchiang/css v1.4.0
|
||||
@ -24,6 +26,8 @@ require (
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/AdguardTeam/golibs v0.29.0 // indirect
|
||||
github.com/AdguardTeam/urlfilter v0.20.0 // indirect
|
||||
github.com/BurntSushi/toml v1.2.1 // indirect
|
||||
github.com/alessandro-c/gomemcached-lock v1.0.0 // indirect
|
||||
github.com/armon/go-metrics v0.4.1 // indirect
|
||||
@ -74,6 +78,7 @@ require (
|
||||
github.com/magefile/mage v1.14.0 // indirect
|
||||
github.com/mattn/go-colorable v0.1.14 // indirect
|
||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||
github.com/miekg/dns v1.1.61 // indirect
|
||||
github.com/mitchellh/go-homedir v1.1.0 // indirect
|
||||
github.com/mitchellh/mapstructure v1.5.0 // indirect
|
||||
github.com/nats-io/nkeys v0.4.9 // indirect
|
||||
@ -93,11 +98,13 @@ require (
|
||||
go.uber.org/multierr v1.9.0 // indirect
|
||||
go.uber.org/zap v1.24.0 // indirect
|
||||
golang.org/x/crypto v0.32.0 // indirect
|
||||
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 // indirect
|
||||
golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 // indirect
|
||||
golang.org/x/mod v0.21.0 // indirect
|
||||
golang.org/x/net v0.34.0 // indirect
|
||||
golang.org/x/sync v0.10.0 // indirect
|
||||
golang.org/x/sys v0.29.0 // indirect
|
||||
golang.org/x/text v0.21.0 // indirect
|
||||
golang.org/x/tools v0.25.0 // indirect
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20240814211410-ddb44dafa142 // indirect
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20240814211410-ddb44dafa142 // indirect
|
||||
google.golang.org/grpc v1.67.1 // indirect
|
||||
|
||||
12
go.sum
12
go.sum
@ -1,3 +1,7 @@
|
||||
github.com/AdguardTeam/golibs v0.29.0 h1:NG3eUXaUwRTgKssblolh4XHME8MQCCdogyIZxxv4bOU=
|
||||
github.com/AdguardTeam/golibs v0.29.0/go.mod h1:vjw1OVZG6BYyoqGRY88U4LCJLOMfhBFhU0UJBdaSAuQ=
|
||||
github.com/AdguardTeam/urlfilter v0.20.0 h1:X32qiuVCVd8WDYCEsbdZKfXMzwdVqrdulamtUi4rmzs=
|
||||
github.com/AdguardTeam/urlfilter v0.20.0/go.mod h1:gjrywLTxfJh6JOkwi9SU+frhP7kVVEZ5exFGkR99qpk=
|
||||
github.com/BurntSushi/toml v1.2.1 h1:9F2/+DoOYIOksmaJFPw1tGFy1eDnIJXg+UHjuD8lTak=
|
||||
github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ=
|
||||
github.com/DataDog/datadog-go v3.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ=
|
||||
@ -286,6 +290,8 @@ github.com/mennanov/limiters v1.11.0/go.mod h1:NFf49GLfiywZ4DFkqK9Ne7e+Ckwl1q0eS
|
||||
github.com/miekg/dns v1.1.26/go.mod h1:bPDLeHnStXmXAq1m/Ch/hvfNHr14JKNPMBo3VZKjuso=
|
||||
github.com/miekg/dns v1.1.41 h1:WMszZWJG0XmzbK9FEmzH2TVcqYzFesusSIB41b8KHxY=
|
||||
github.com/miekg/dns v1.1.41/go.mod h1:p6aan82bvRIyn+zDIv9xYNUpwa73JcSh9BKwknJysuI=
|
||||
github.com/miekg/dns v1.1.61 h1:nLxbwF3XxhwVSm8g9Dghm9MHPaUZuqhPiGL+675ZmEs=
|
||||
github.com/miekg/dns v1.1.61/go.mod h1:mnAarhS3nWaW+NVP2wTkYVIZyHNJ098SJZUki3eykwQ=
|
||||
github.com/mitchellh/cli v1.1.0/go.mod h1:xcISNoH86gajksDmfB23e/pu+B+GeFRMYmoHXxx3xhI=
|
||||
github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y=
|
||||
github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
|
||||
@ -420,10 +426,14 @@ golang.org/x/crypto v0.32.0 h1:euUpcYgM8WcP71gNpTqQCn6rC2t6ULUPiOzfWaXVVfc=
|
||||
golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc=
|
||||
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 h1:m64FZMko/V45gv0bNmrNYoDEq8U5YUhetc9cBWKS1TQ=
|
||||
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63/go.mod h1:0v4NqG35kSWCMzLaMeX+IQrlSnVE/bqGSyC2cz/9Le8=
|
||||
golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 h1:e66Fs6Z+fZTbFBAxKfP3PALWBtpfqks2bwGcexMxgtk=
|
||||
golang.org/x/exp v0.0.0-20240909161429-701f63a606c0/go.mod h1:2TbTHSBQa924w8M6Xs1QcRcFwyucIwBGpK1p2f1YFFY=
|
||||
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
|
||||
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
|
||||
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
|
||||
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
|
||||
golang.org/x/mod v0.21.0 h1:vvrHzRwRfVKSiLrG+d4FMl/Qi4ukBCE6kZlTUkDYRT0=
|
||||
golang.org/x/mod v0.21.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY=
|
||||
golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
|
||||
@ -518,6 +528,8 @@ golang.org/x/tools v0.0.0-20201224043029-2b0845dc783e/go.mod h1:emZCQorbCU4vsT4f
|
||||
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
|
||||
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
|
||||
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
|
||||
golang.org/x/tools v0.25.0 h1:oFU9pkj/iJgs+0DT+VMHrx+oBKs/LJMV+Uvg78sl+fE=
|
||||
golang.org/x/tools v0.25.0/go.mod h1:/vtpO8WL1N9cQC3FN5zPqb//fRXskFHbLKk4OW1Q7rg=
|
||||
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
|
||||
51
internal/extractors/pwextractor/adblock.go
Normal file
51
internal/extractors/pwextractor/adblock.go
Normal file
@ -0,0 +1,51 @@
|
||||
package pwextractor
|
||||
|
||||
import (
|
||||
_ "embed"
|
||||
"fmt"
|
||||
"github.com/AdguardTeam/urlfilter"
|
||||
"github.com/AdguardTeam/urlfilter/filterlist"
|
||||
"github.com/AdguardTeam/urlfilter/rules"
|
||||
"github.com/labstack/gommon/log"
|
||||
"net/url"
|
||||
)
|
||||
|
||||
// easyList holds the text of blocklists/easylist.txt, embedded into the
// binary at build time.
//
//go:embed blocklists/easylist.txt
|
||||
var easyList string
|
||||
|
||||
// easyPrivacy holds the text of blocklists/easyprivacy.txt, embedded into the
// binary at build time.
//
//go:embed blocklists/easyprivacy.txt
|
||||
var easyPrivacy string
|
||||
|
||||
// ruleLists enumerates the embedded filter lists that init loads into the
// engine. The position of a list in this slice is used as its list ID.
var ruleLists = []string{
|
||||
easyList,
|
||||
easyPrivacy,
|
||||
}
|
||||
|
||||
// engine is the package-level matching engine. It is assigned once in init
// and queried by allowAdblock.
var engine *urlfilter.Engine
|
||||
|
||||
func init() {
|
||||
lists := make([]filterlist.RuleList, len(ruleLists))
|
||||
for i, rulesStr := range ruleLists {
|
||||
lists[i] = &filterlist.StringRuleList{
|
||||
RulesText: rulesStr,
|
||||
ID: i,
|
||||
IgnoreCosmetic: true,
|
||||
}
|
||||
}
|
||||
storage, err := filterlist.NewRuleStorage(lists)
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("initialize adblock: NewRuleStorage: %v", err))
|
||||
}
|
||||
engine = urlfilter.NewEngine(storage)
|
||||
}
|
||||
|
||||
func allowAdblock(url *url.URL, sourceUrl *url.URL) bool {
|
||||
req := rules.NewRequest(url.String(), sourceUrl.String(), rules.TypeOther)
|
||||
res := engine.MatchRequest(req)
|
||||
rule := res.GetBasicResult()
|
||||
allow := rule == nil || rule.Whitelist
|
||||
if !allow {
|
||||
log.Infof("Adblock blocked %s from %s by rule %s", url, sourceUrl, rule.String())
|
||||
}
|
||||
return allow
|
||||
}
|
||||
63423
internal/extractors/pwextractor/blocklists/easylist.txt
Normal file
63423
internal/extractors/pwextractor/blocklists/easylist.txt
Normal file
File diff suppressed because it is too large
Load Diff
53860
internal/extractors/pwextractor/blocklists/easyprivacy.txt
Normal file
53860
internal/extractors/pwextractor/blocklists/easyprivacy.txt
Normal file
File diff suppressed because one or more lines are too long
@ -10,6 +10,7 @@ import (
|
||||
"github.com/playwright-community/playwright-go"
|
||||
"maps"
|
||||
"net"
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
@ -33,6 +34,8 @@ type PwExtractor struct {
|
||||
cookieManager CookieManager
|
||||
limiter limiter.Limiter
|
||||
proxyIP net.IP
|
||||
allowed int
|
||||
blocked int
|
||||
}
|
||||
|
||||
type Config struct {
|
||||
@ -99,6 +102,11 @@ const MAX_RETRIES = 3 // todo: config
|
||||
|
||||
func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page) error) (errRet error) {
|
||||
|
||||
taskUrl, err := url.Parse(task.URL)
|
||||
if err != nil {
|
||||
return fmt.Errorf("parse task url: %w", err)
|
||||
}
|
||||
|
||||
baseDomain, scheme, err := parseBaseDomain(task.URL)
|
||||
if err != nil {
|
||||
return fmt.Errorf("parse base domain: %w", err)
|
||||
@ -145,7 +153,7 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
|
||||
}
|
||||
}()
|
||||
|
||||
if err := e.setupInterceptors(bCtx); err != nil {
|
||||
if err := e.setupInterceptors(bCtx, taskUrl); err != nil {
|
||||
return fmt.Errorf("setup interceptors: %w", err)
|
||||
}
|
||||
|
||||
@ -197,7 +205,13 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
|
||||
|
||||
start := time.Now()
|
||||
err = cb(page)
|
||||
log.Infof("Visiting page %s finished, time=%f secs, err=%v", task.URL, time.Since(start).Seconds(), err)
|
||||
log.Infof(
|
||||
"Visiting page %s finished, time=%f secs, allowed hosts=%d, blocked hosts=%d, err=%v",
|
||||
task.URL,
|
||||
time.Since(start).Seconds(),
|
||||
e.allowed, e.blocked,
|
||||
err,
|
||||
)
|
||||
|
||||
if len(cookies) > 0 {
|
||||
bCookies, err := bCtx.Cookies(fmt.Sprintf("%s://%s", scheme, baseDomain))
|
||||
@ -218,7 +232,7 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
|
||||
return err
|
||||
}
|
||||
|
||||
func (e *PwExtractor) setupInterceptors(bCtx playwright.BrowserContext) error {
|
||||
func (e *PwExtractor) setupInterceptors(bCtx playwright.BrowserContext, sourceUrl *url.URL) error {
|
||||
if err := bCtx.Route("**", func(route playwright.Route) {
|
||||
log.Debugf("Route: %s", route.Request().URL())
|
||||
allowHost, err := e.allowHost(route.Request().URL())
|
||||
@ -226,11 +240,19 @@ func (e *PwExtractor) setupInterceptors(bCtx playwright.BrowserContext) error {
|
||||
log.Errorf("Allow host: %v", err)
|
||||
allowHost = false
|
||||
}
|
||||
URL, err := url.Parse(route.Request().URL())
|
||||
if err != nil {
|
||||
log.Errorf("Interceptor parse url: %v", err)
|
||||
allowHost = false
|
||||
}
|
||||
allowHost = allowHost && allowAdblock(URL, sourceUrl)
|
||||
if allowHost {
|
||||
e.allowed++
|
||||
if err := route.Continue(); err != nil {
|
||||
log.Warnf("Route continue error: %v", err)
|
||||
}
|
||||
} else {
|
||||
e.blocked++
|
||||
if err := route.Abort(); err != nil {
|
||||
log.Warnf("Route abort error: %v", err)
|
||||
}
|
||||
@ -246,11 +268,19 @@ func (e *PwExtractor) setupInterceptors(bCtx playwright.BrowserContext) error {
|
||||
log.Errorf("Allow host: %v", err)
|
||||
allowHost = false
|
||||
}
|
||||
URL, err := url.Parse(route.URL())
|
||||
if err != nil {
|
||||
log.Errorf("Interceptor websocket parse url: %v", err)
|
||||
allowHost = false
|
||||
}
|
||||
allowHost = allowHost && allowAdblock(URL, sourceUrl)
|
||||
if allowHost {
|
||||
e.allowed++
|
||||
if _, err := route.ConnectToServer(); err != nil {
|
||||
log.Warnf("Websocket connect error: %v", err)
|
||||
}
|
||||
} else {
|
||||
e.blocked++
|
||||
route.Close()
|
||||
}
|
||||
}); err != nil {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user