This commit is contained in:
Egor Aristov 2025-04-24 22:20:37 +03:00
parent 31bbc97f9b
commit e0af3770f6
Signed by: egor3f
GPG Key ID: 40482A264AAEC85F
6 changed files with 117388 additions and 5 deletions

11
go.mod
View File

@ -1,6 +1,8 @@
module github.com/egor3f/rssalchemy module github.com/egor3f/rssalchemy
go 1.23 go 1.23.2
toolchain go1.24.0
require ( require (
github.com/ericchiang/css v1.4.0 github.com/ericchiang/css v1.4.0
@ -24,6 +26,8 @@ require (
) )
require ( require (
github.com/AdguardTeam/golibs v0.29.0 // indirect
github.com/AdguardTeam/urlfilter v0.20.0 // indirect
github.com/BurntSushi/toml v1.2.1 // indirect github.com/BurntSushi/toml v1.2.1 // indirect
github.com/alessandro-c/gomemcached-lock v1.0.0 // indirect github.com/alessandro-c/gomemcached-lock v1.0.0 // indirect
github.com/armon/go-metrics v0.4.1 // indirect github.com/armon/go-metrics v0.4.1 // indirect
@ -74,6 +78,7 @@ require (
github.com/magefile/mage v1.14.0 // indirect github.com/magefile/mage v1.14.0 // indirect
github.com/mattn/go-colorable v0.1.14 // indirect github.com/mattn/go-colorable v0.1.14 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect github.com/mattn/go-isatty v0.0.20 // indirect
github.com/miekg/dns v1.1.61 // indirect
github.com/mitchellh/go-homedir v1.1.0 // indirect github.com/mitchellh/go-homedir v1.1.0 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/nats-io/nkeys v0.4.9 // indirect github.com/nats-io/nkeys v0.4.9 // indirect
@ -93,11 +98,13 @@ require (
go.uber.org/multierr v1.9.0 // indirect go.uber.org/multierr v1.9.0 // indirect
go.uber.org/zap v1.24.0 // indirect go.uber.org/zap v1.24.0 // indirect
golang.org/x/crypto v0.32.0 // indirect golang.org/x/crypto v0.32.0 // indirect
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 // indirect golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 // indirect
golang.org/x/mod v0.21.0 // indirect
golang.org/x/net v0.34.0 // indirect golang.org/x/net v0.34.0 // indirect
golang.org/x/sync v0.10.0 // indirect golang.org/x/sync v0.10.0 // indirect
golang.org/x/sys v0.29.0 // indirect golang.org/x/sys v0.29.0 // indirect
golang.org/x/text v0.21.0 // indirect golang.org/x/text v0.21.0 // indirect
golang.org/x/tools v0.25.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20240814211410-ddb44dafa142 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20240814211410-ddb44dafa142 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240814211410-ddb44dafa142 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20240814211410-ddb44dafa142 // indirect
google.golang.org/grpc v1.67.1 // indirect google.golang.org/grpc v1.67.1 // indirect

12
go.sum
View File

@ -1,3 +1,7 @@
github.com/AdguardTeam/golibs v0.29.0 h1:NG3eUXaUwRTgKssblolh4XHME8MQCCdogyIZxxv4bOU=
github.com/AdguardTeam/golibs v0.29.0/go.mod h1:vjw1OVZG6BYyoqGRY88U4LCJLOMfhBFhU0UJBdaSAuQ=
github.com/AdguardTeam/urlfilter v0.20.0 h1:X32qiuVCVd8WDYCEsbdZKfXMzwdVqrdulamtUi4rmzs=
github.com/AdguardTeam/urlfilter v0.20.0/go.mod h1:gjrywLTxfJh6JOkwi9SU+frhP7kVVEZ5exFGkR99qpk=
github.com/BurntSushi/toml v1.2.1 h1:9F2/+DoOYIOksmaJFPw1tGFy1eDnIJXg+UHjuD8lTak= github.com/BurntSushi/toml v1.2.1 h1:9F2/+DoOYIOksmaJFPw1tGFy1eDnIJXg+UHjuD8lTak=
github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ=
github.com/DataDog/datadog-go v3.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ= github.com/DataDog/datadog-go v3.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ=
@ -286,6 +290,8 @@ github.com/mennanov/limiters v1.11.0/go.mod h1:NFf49GLfiywZ4DFkqK9Ne7e+Ckwl1q0eS
github.com/miekg/dns v1.1.26/go.mod h1:bPDLeHnStXmXAq1m/Ch/hvfNHr14JKNPMBo3VZKjuso= github.com/miekg/dns v1.1.26/go.mod h1:bPDLeHnStXmXAq1m/Ch/hvfNHr14JKNPMBo3VZKjuso=
github.com/miekg/dns v1.1.41 h1:WMszZWJG0XmzbK9FEmzH2TVcqYzFesusSIB41b8KHxY= github.com/miekg/dns v1.1.41 h1:WMszZWJG0XmzbK9FEmzH2TVcqYzFesusSIB41b8KHxY=
github.com/miekg/dns v1.1.41/go.mod h1:p6aan82bvRIyn+zDIv9xYNUpwa73JcSh9BKwknJysuI= github.com/miekg/dns v1.1.41/go.mod h1:p6aan82bvRIyn+zDIv9xYNUpwa73JcSh9BKwknJysuI=
github.com/miekg/dns v1.1.61 h1:nLxbwF3XxhwVSm8g9Dghm9MHPaUZuqhPiGL+675ZmEs=
github.com/miekg/dns v1.1.61/go.mod h1:mnAarhS3nWaW+NVP2wTkYVIZyHNJ098SJZUki3eykwQ=
github.com/mitchellh/cli v1.1.0/go.mod h1:xcISNoH86gajksDmfB23e/pu+B+GeFRMYmoHXxx3xhI= github.com/mitchellh/cli v1.1.0/go.mod h1:xcISNoH86gajksDmfB23e/pu+B+GeFRMYmoHXxx3xhI=
github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y=
github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
@ -420,10 +426,14 @@ golang.org/x/crypto v0.32.0 h1:euUpcYgM8WcP71gNpTqQCn6rC2t6ULUPiOzfWaXVVfc=
golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc= golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc=
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 h1:m64FZMko/V45gv0bNmrNYoDEq8U5YUhetc9cBWKS1TQ= golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 h1:m64FZMko/V45gv0bNmrNYoDEq8U5YUhetc9cBWKS1TQ=
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63/go.mod h1:0v4NqG35kSWCMzLaMeX+IQrlSnVE/bqGSyC2cz/9Le8= golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63/go.mod h1:0v4NqG35kSWCMzLaMeX+IQrlSnVE/bqGSyC2cz/9Le8=
golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 h1:e66Fs6Z+fZTbFBAxKfP3PALWBtpfqks2bwGcexMxgtk=
golang.org/x/exp v0.0.0-20240909161429-701f63a606c0/go.mod h1:2TbTHSBQa924w8M6Xs1QcRcFwyucIwBGpK1p2f1YFFY=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/mod v0.21.0 h1:vvrHzRwRfVKSiLrG+d4FMl/Qi4ukBCE6kZlTUkDYRT0=
golang.org/x/mod v0.21.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY=
golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
@ -518,6 +528,8 @@ golang.org/x/tools v0.0.0-20201224043029-2b0845dc783e/go.mod h1:emZCQorbCU4vsT4f
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/tools v0.25.0 h1:oFU9pkj/iJgs+0DT+VMHrx+oBKs/LJMV+Uvg78sl+fE=
golang.org/x/tools v0.25.0/go.mod h1:/vtpO8WL1N9cQC3FN5zPqb//fRXskFHbLKk4OW1Q7rg=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=

View File

@ -0,0 +1,51 @@
package pwextractor
import (
_ "embed"
"fmt"
"github.com/AdguardTeam/urlfilter"
"github.com/AdguardTeam/urlfilter/filterlist"
"github.com/AdguardTeam/urlfilter/rules"
"github.com/labstack/gommon/log"
"net/url"
)
// easyList holds the text of blocklists/easylist.txt, embedded into the
// binary at build time.
//
//go:embed blocklists/easylist.txt
var easyList string

// easyPrivacy holds the text of blocklists/easyprivacy.txt, embedded into the
// binary at build time.
//
//go:embed blocklists/easyprivacy.txt
var easyPrivacy string

// ruleLists enumerates the embedded filter lists that init loads into the
// engine. The position of a list in this slice is used as its list ID.
var ruleLists = []string{
easyList,
easyPrivacy,
}

// engine is the package-wide filtering engine. It is assigned once in init
// and queried by allowAdblock.
var engine *urlfilter.Engine
// init compiles the embedded filter lists into the package-level engine.
//
// Every entry of ruleLists becomes one string-backed rule list whose ID is
// its index in ruleLists. If the rule storage cannot be created the function
// panics, so the package never runs with an unset engine.
func init() {
	sources := make([]filterlist.RuleList, 0, len(ruleLists))
	for id := range ruleLists {
		list := new(filterlist.StringRuleList)
		list.RulesText = ruleLists[id]
		list.ID = id
		// NOTE(review): presumably makes the list skip cosmetic (element
		// hiding) rules, which are not needed for request matching —
		// confirm against the urlfilter documentation.
		list.IgnoreCosmetic = true
		sources = append(sources, list)
	}

	ruleStorage, storageErr := filterlist.NewRuleStorage(sources)
	if storageErr != nil {
		panic(fmt.Sprintf("initialize adblock: NewRuleStorage: %v", storageErr))
	}

	engine = urlfilter.NewEngine(ruleStorage)
}
// allowAdblock reports whether a request to requestURL, issued while loading
// sourceUrl, may proceed according to the package-level filtering engine.
//
// The request is allowed when no basic rule matches it, or when the matching
// rule is a whitelist rule. A blocked request is logged at info level
// together with the rule that matched.
//
// A nil requestURL or sourceUrl yields false instead of panicking on
// String(): a request that cannot be evaluated is denied, which mirrors how
// the interceptors treat URLs that fail to parse.
//
// NOTE(review): every request is matched as rules.TypeOther, so rules that
// are restricted to a specific content type presumably never apply — confirm
// against the urlfilter documentation before relying on such rules.
func allowAdblock(requestURL *url.URL, sourceUrl *url.URL) bool {
	// Fail closed: without both URLs the engine cannot be queried.
	if requestURL == nil || sourceUrl == nil {
		return false
	}

	req := rules.NewRequest(requestURL.String(), sourceUrl.String(), rules.TypeOther)
	res := engine.MatchRequest(req)
	rule := res.GetBasicResult()
	if rule == nil || rule.Whitelist {
		return true
	}

	log.Infof("Adblock blocked %s from %s by rule %s", requestURL, sourceUrl, rule.String())
	return false
}

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -10,6 +10,7 @@ import (
"github.com/playwright-community/playwright-go" "github.com/playwright-community/playwright-go"
"maps" "maps"
"net" "net"
"net/url"
"strings" "strings"
"time" "time"
) )
@ -33,6 +34,8 @@ type PwExtractor struct {
cookieManager CookieManager cookieManager CookieManager
limiter limiter.Limiter limiter limiter.Limiter
proxyIP net.IP proxyIP net.IP
allowed int
blocked int
} }
type Config struct { type Config struct {
@ -99,6 +102,11 @@ const MAX_RETRIES = 3 // todo: config
func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page) error) (errRet error) { func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page) error) (errRet error) {
taskUrl, err := url.Parse(task.URL)
if err != nil {
return fmt.Errorf("parse task url: %w", err)
}
baseDomain, scheme, err := parseBaseDomain(task.URL) baseDomain, scheme, err := parseBaseDomain(task.URL)
if err != nil { if err != nil {
return fmt.Errorf("parse base domain: %w", err) return fmt.Errorf("parse base domain: %w", err)
@ -145,7 +153,7 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
} }
}() }()
if err := e.setupInterceptors(bCtx); err != nil { if err := e.setupInterceptors(bCtx, taskUrl); err != nil {
return fmt.Errorf("setup interceptors: %w", err) return fmt.Errorf("setup interceptors: %w", err)
} }
@ -197,7 +205,13 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
start := time.Now() start := time.Now()
err = cb(page) err = cb(page)
log.Infof("Visiting page %s finished, time=%f secs, err=%v", task.URL, time.Since(start).Seconds(), err) log.Infof(
"Visiting page %s finished, time=%f secs, allowed hosts=%d, blocked hosts=%d, err=%v",
task.URL,
time.Since(start).Seconds(),
e.allowed, e.blocked,
err,
)
if len(cookies) > 0 { if len(cookies) > 0 {
bCookies, err := bCtx.Cookies(fmt.Sprintf("%s://%s", scheme, baseDomain)) bCookies, err := bCtx.Cookies(fmt.Sprintf("%s://%s", scheme, baseDomain))
@ -218,7 +232,7 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
return err return err
} }
func (e *PwExtractor) setupInterceptors(bCtx playwright.BrowserContext) error { func (e *PwExtractor) setupInterceptors(bCtx playwright.BrowserContext, sourceUrl *url.URL) error {
if err := bCtx.Route("**", func(route playwright.Route) { if err := bCtx.Route("**", func(route playwright.Route) {
log.Debugf("Route: %s", route.Request().URL()) log.Debugf("Route: %s", route.Request().URL())
allowHost, err := e.allowHost(route.Request().URL()) allowHost, err := e.allowHost(route.Request().URL())
@ -226,11 +240,19 @@ func (e *PwExtractor) setupInterceptors(bCtx playwright.BrowserContext) error {
log.Errorf("Allow host: %v", err) log.Errorf("Allow host: %v", err)
allowHost = false allowHost = false
} }
URL, err := url.Parse(route.Request().URL())
if err != nil {
log.Errorf("Interceptor parse url: %v", err)
allowHost = false
}
allowHost = allowHost && allowAdblock(URL, sourceUrl)
if allowHost { if allowHost {
e.allowed++
if err := route.Continue(); err != nil { if err := route.Continue(); err != nil {
log.Warnf("Route continue error: %v", err) log.Warnf("Route continue error: %v", err)
} }
} else { } else {
e.blocked++
if err := route.Abort(); err != nil { if err := route.Abort(); err != nil {
log.Warnf("Route abort error: %v", err) log.Warnf("Route abort error: %v", err)
} }
@ -246,11 +268,19 @@ func (e *PwExtractor) setupInterceptors(bCtx playwright.BrowserContext) error {
log.Errorf("Allow host: %v", err) log.Errorf("Allow host: %v", err)
allowHost = false allowHost = false
} }
URL, err := url.Parse(route.URL())
if err != nil {
log.Errorf("Interceptor websocket parse url: %v", err)
allowHost = false
}
allowHost = allowHost && allowAdblock(URL, sourceUrl)
if allowHost { if allowHost {
e.allowed++
if _, err := route.ConnectToServer(); err != nil { if _, err := route.ConnectToServer(); err != nil {
log.Warnf("Websocket connect error: %v", err) log.Warnf("Websocket connect error: %v", err)
} }
} else { } else {
e.blocked++
route.Close() route.Close()
} }
}); err != nil { }); err != nil {