block local ips and service worker; util function to resolve arbitrary 'host-like' string, unit testing

This commit is contained in:
Egor Aristov 2025-02-19 12:10:35 +03:00
parent 8d21583f9e
commit 08326debdd
Signed by: egor3f
GPG Key ID: 40482A264AAEC85F
5 changed files with 233 additions and 5 deletions

6
go.mod
View File

@ -8,6 +8,7 @@ require (
github.com/go-redsync/redsync/v4 v4.8.1 github.com/go-redsync/redsync/v4 v4.8.1
github.com/gorilla/feeds v1.2.0 github.com/gorilla/feeds v1.2.0
github.com/ilyakaznacheev/cleanenv v1.5.0 github.com/ilyakaznacheev/cleanenv v1.5.0
github.com/jellydator/ttlcache/v3 v3.3.0
github.com/labstack/echo/v4 v4.13.3 github.com/labstack/echo/v4 v4.13.3
github.com/labstack/gommon v0.4.2 github.com/labstack/gommon v0.4.2
github.com/markusmobius/go-dateparser v1.2.3 github.com/markusmobius/go-dateparser v1.2.3
@ -15,6 +16,7 @@ require (
github.com/nats-io/nats.go v1.38.0 github.com/nats-io/nats.go v1.38.0
github.com/playwright-community/playwright-go v0.5001.0 github.com/playwright-community/playwright-go v0.5001.0
github.com/redis/go-redis/v9 v9.7.0 github.com/redis/go-redis/v9 v9.7.0
github.com/stretchr/testify v1.10.0
golang.org/x/time v0.8.0 golang.org/x/time v0.8.0
) )
@ -36,6 +38,7 @@ require (
github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/coreos/go-semver v0.3.1 // indirect github.com/coreos/go-semver v0.3.1 // indirect
github.com/coreos/go-systemd/v22 v22.5.0 // indirect github.com/coreos/go-systemd/v22 v22.5.0 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/deckarep/golang-set/v2 v2.7.0 // indirect github.com/deckarep/golang-set/v2 v2.7.0 // indirect
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
github.com/elliotchance/pie/v2 v2.7.0 // indirect github.com/elliotchance/pie/v2 v2.7.0 // indirect
@ -72,8 +75,8 @@ require (
github.com/nats-io/nkeys v0.4.9 // indirect github.com/nats-io/nkeys v0.4.9 // indirect
github.com/nats-io/nuid v1.0.1 // indirect github.com/nats-io/nuid v1.0.1 // indirect
github.com/pkg/errors v0.9.1 // indirect github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414 // indirect github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414 // indirect
github.com/stretchr/objx v0.5.2 // indirect
github.com/tetratelabs/wazero v1.2.1 // indirect github.com/tetratelabs/wazero v1.2.1 // indirect
github.com/thanhpk/randstr v1.0.4 // indirect github.com/thanhpk/randstr v1.0.4 // indirect
github.com/valyala/bytebufferpool v1.0.0 // indirect github.com/valyala/bytebufferpool v1.0.0 // indirect
@ -88,6 +91,7 @@ require (
golang.org/x/crypto v0.32.0 // indirect golang.org/x/crypto v0.32.0 // indirect
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 // indirect golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 // indirect
golang.org/x/net v0.34.0 // indirect golang.org/x/net v0.34.0 // indirect
golang.org/x/sync v0.10.0 // indirect
golang.org/x/sys v0.29.0 // indirect golang.org/x/sys v0.29.0 // indirect
golang.org/x/text v0.21.0 // indirect golang.org/x/text v0.21.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20240814211410-ddb44dafa142 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20240814211410-ddb44dafa142 // indirect

10
go.sum
View File

@ -210,6 +210,8 @@ github.com/ilyakaznacheev/cleanenv v1.5.0 h1:0VNZXggJE2OYdXE87bfSSwGxeiGt9moSR2l
github.com/ilyakaznacheev/cleanenv v1.5.0/go.mod h1:a5aDzaJrLCQZsazHol1w8InnDcOX0OColm64SlIi6gk= github.com/ilyakaznacheev/cleanenv v1.5.0/go.mod h1:a5aDzaJrLCQZsazHol1w8InnDcOX0OColm64SlIi6gk=
github.com/jalaali/go-jalaali v0.0.0-20210801064154-80525e88d958 h1:qxLoi6CAcXVzjfvu+KXIXJOAsQB62LXjsfbOaErsVzE= github.com/jalaali/go-jalaali v0.0.0-20210801064154-80525e88d958 h1:qxLoi6CAcXVzjfvu+KXIXJOAsQB62LXjsfbOaErsVzE=
github.com/jalaali/go-jalaali v0.0.0-20210801064154-80525e88d958/go.mod h1:Wqfu7mjUHj9WDzSSPI5KfBclTTEnLveRUFr/ujWnTgE= github.com/jalaali/go-jalaali v0.0.0-20210801064154-80525e88d958/go.mod h1:Wqfu7mjUHj9WDzSSPI5KfBclTTEnLveRUFr/ujWnTgE=
github.com/jellydator/ttlcache/v3 v3.3.0 h1:BdoC9cE81qXfrxeb9eoJi9dWrdhSuwXMAnHTbnBm4Wc=
github.com/jellydator/ttlcache/v3 v3.3.0/go.mod h1:bj2/e0l4jRnQdrnSTaGTsh4GSXvMjQcy41i7th0GVGw=
github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg=
github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8= github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8=
@ -300,8 +302,6 @@ github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/playwright-community/playwright-go v0.4901.0 h1:d+1KxF5PNAHZ0gTMQ9bPSyYRWii8soJ7Rt0gLWDejc4=
github.com/playwright-community/playwright-go v0.4901.0/go.mod h1:kBNWs/w2aJ2ZUp1wEOOFLXgOqvppFngM5OS+qyhl+ZM=
github.com/playwright-community/playwright-go v0.5001.0 h1:EY3oB+rU9cUp6CLHguWE8VMZTwAg+83Yyb7dQqEmGLg= github.com/playwright-community/playwright-go v0.5001.0 h1:EY3oB+rU9cUp6CLHguWE8VMZTwAg+83Yyb7dQqEmGLg=
github.com/playwright-community/playwright-go v0.5001.0/go.mod h1:kBNWs/w2aJ2ZUp1wEOOFLXgOqvppFngM5OS+qyhl+ZM= github.com/playwright-community/playwright-go v0.5001.0/go.mod h1:kBNWs/w2aJ2ZUp1wEOOFLXgOqvppFngM5OS+qyhl+ZM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
@ -375,8 +375,8 @@ go.etcd.io/etcd/client/v3 v3.5.17 h1:o48sINNeWz5+pjy/Z0+HKpj/xSnBkuVhVvXkjEXbqZY
go.etcd.io/etcd/client/v3 v3.5.17/go.mod h1:j2d4eXTHWkT2ClBgnnEPm/Wuu7jsqku41v9DZ3OtjQo= go.etcd.io/etcd/client/v3 v3.5.17/go.mod h1:j2d4eXTHWkT2ClBgnnEPm/Wuu7jsqku41v9DZ3OtjQo=
go.uber.org/atomic v1.10.0 h1:9qC72Qh0+3MqyJbAn8YU5xVq1frD8bn3JtD2oXtafVQ= go.uber.org/atomic v1.10.0 h1:9qC72Qh0+3MqyJbAn8YU5xVq1frD8bn3JtD2oXtafVQ=
go.uber.org/atomic v1.10.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0= go.uber.org/atomic v1.10.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0=
go.uber.org/goleak v1.1.11 h1:wy28qYRKZgnJTxGxvye5/wgWr1EKjmUDGYox5mGlRlI= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.1.11/go.mod h1:cwTWslyiVhfpKIDGSZEM2HlOvcqm+tG4zioyIeLoqMQ= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.uber.org/multierr v1.9.0 h1:7fIwc/ZtS0q++VgcfqFDxSBZVv/Xo49/SYnDFupUwlI= go.uber.org/multierr v1.9.0 h1:7fIwc/ZtS0q++VgcfqFDxSBZVv/Xo49/SYnDFupUwlI=
go.uber.org/multierr v1.9.0/go.mod h1:X2jQV1h+kxSjClGpnseKVIxpmcjrj7MNnI0bnlfKTVQ= go.uber.org/multierr v1.9.0/go.mod h1:X2jQV1h+kxSjClGpnseKVIxpmcjrj7MNnI0bnlfKTVQ=
go.uber.org/zap v1.24.0 h1:FiJd5l1UOLj0wCgbSE0rwwXHzEdAZS6hiiSnxJN/D60= go.uber.org/zap v1.24.0 h1:FiJd5l1UOLj0wCgbSE0rwwXHzEdAZS6hiiSnxJN/D60=
@ -423,6 +423,8 @@ golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ=
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=

View File

@ -8,6 +8,7 @@ import (
"github.com/labstack/gommon/log" "github.com/labstack/gommon/log"
"github.com/playwright-community/playwright-go" "github.com/playwright-community/playwright-go"
"maps" "maps"
"net"
"strings" "strings"
"time" "time"
) )
@ -30,6 +31,7 @@ type PwExtractor struct {
dateParser DateParser dateParser DateParser
cookieManager CookieManager cookieManager CookieManager
limiter limiter.Limiter limiter limiter.Limiter
proxyIP net.IP
} }
type Config struct { type Config struct {
@ -50,6 +52,13 @@ func New(cfg Config) (*PwExtractor, error) {
if err != nil { if err != nil {
return nil, fmt.Errorf("parse proxy: %w", err) return nil, fmt.Errorf("parse proxy: %w", err)
} }
if proxy != nil {
proxyIPs, err := getIPs(proxy.Server)
if err != nil {
return nil, fmt.Errorf("get proxy ip: %w", err)
}
e.proxyIP = proxyIPs[0]
}
e.chrome, err = e.pw.Chromium.Launch(playwright.BrowserTypeLaunchOptions{ e.chrome, err = e.pw.Chromium.Launch(playwright.BrowserTypeLaunchOptions{
Args: []string{ Args: []string{
"--webrtc-ip-handling-policy=disable_non_proxied_udp", "--webrtc-ip-handling-policy=disable_non_proxied_udp",
@ -121,6 +130,8 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
bCtx, err := e.chrome.NewContext(playwright.BrowserNewContextOptions{ bCtx, err := e.chrome.NewContext(playwright.BrowserNewContextOptions{
ExtraHttpHeaders: headers, ExtraHttpHeaders: headers,
UserAgent: &userAgent, UserAgent: &userAgent,
ServiceWorkers: playwright.ServiceWorkerPolicyBlock,
AcceptDownloads: playwright.Bool(false),
}) })
if err != nil { if err != nil {
return fmt.Errorf("create browser context: %w", err) return fmt.Errorf("create browser context: %w", err)
@ -131,6 +142,44 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
} }
}() }()
if err := bCtx.Route("**", func(route playwright.Route) {
log.Debugf("Route: %s", route.Request().URL())
allowHost, err := e.allowHost(route.Request().URL())
if err != nil {
log.Errorf("Allow host: %v", err)
allowHost = false
}
if allowHost {
if err := route.Continue(); err != nil {
log.Warnf("Route continue error: %v", err)
}
} else {
if err := route.Abort(); err != nil {
log.Warnf("Route abort error: %v", err)
}
}
}); err != nil {
return fmt.Errorf("set route: %w", err)
}
if err := bCtx.RouteWebSocket("**", func(route playwright.WebSocketRoute) {
log.Debugf("Websocket route: %s", route.URL())
allowHost, err := e.allowHost(route.URL())
if err != nil {
log.Errorf("Allow host: %v", err)
allowHost = false
}
if allowHost {
if _, err := route.ConnectToServer(); err != nil {
log.Warnf("Websocket connect error: %v", err)
}
} else {
route.Close()
}
}); err != nil {
return fmt.Errorf("websocket set route: %w", err)
}
if len(cookies) > 0 { if len(cookies) > 0 {
var pwCookies []playwright.OptionalCookie var pwCookies []playwright.OptionalCookie
for _, cook := range cookies { for _, cook := range cookies {
@ -193,6 +242,22 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
return err return err
} }
// allowHost reports whether a request to rawUrl is permitted.
//
// rawUrl is resolved with getIPs; the request is rejected as soon as any of
// the resolved addresses is private, loopback, unspecified, link-local
// unicast, multicast, or equal to the extractor's own proxy address
// (e.proxyIP). A resolution failure is returned as an error together with
// false, so callers can treat it as a denial.
func (e *PwExtractor) allowHost(rawUrl string) (bool, error) {
	resolved, err := getIPs(rawUrl)
	if err != nil {
		return false, fmt.Errorf("allow host get ips: %w", err)
	}

	for _, addr := range resolved {
		// One forbidden address is enough to ban the whole host.
		switch {
		case addr.IsPrivate(),
			addr.IsLoopback(),
			addr.IsUnspecified(),
			addr.IsLinkLocalUnicast(),
			addr.IsMulticast(),
			e.proxyIP.Equal(addr):
			log.Debugf("Banned address: %s", rawUrl)
			return false, nil
		}
	}

	return true, nil
}
func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errRet error) { func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errRet error) {
errRet = e.visitPage(task, func(page playwright.Page) error { errRet = e.visitPage(task, func(page playwright.Page) error {
parser := pageParser{ parser := pageParser{

View File

@ -2,7 +2,9 @@ package pwextractor
import ( import (
"fmt" "fmt"
"github.com/jellydator/ttlcache/v3"
"github.com/playwright-community/playwright-go" "github.com/playwright-community/playwright-go"
"net"
"net/url" "net/url"
"slices" "slices"
"strings" "strings"
@ -66,3 +68,49 @@ func parseBaseDomain(urlStr string) (domain string, scheme string, err error) {
} }
return fmt.Sprintf("%s.%s", domainParts[1], domainParts[0]), scheme, nil return fmt.Sprintf("%s.%s", domainParts[1], domainParts[0]), scheme, nil
} }
// dnsCache maps a hostname to the IP addresses it last resolved to.
// Entries live for one minute, and reading an entry does not extend its
// lifetime (touch-on-hit is disabled).
var dnsCache = ttlcache.New[string, []net.IP](
	ttlcache.WithTTL[string, []net.IP](time.Minute),
	ttlcache.WithDisableTouchOnHit[string, []net.IP](),
)

// init starts the cache's background loop in its own goroutine; the
// goroutine is never stopped and runs for the lifetime of the process.
func init() {
	go dnsCache.Start()
}
// getIPs resolves a "host-like" string to one or more IP addresses.
//
// host may be a literal IPv4/IPv6 address, a URL (its hostname is extracted
// and any port is dropped), or a bare hostname. Literal addresses are
// returned directly without a DNS lookup; hostnames are resolved with
// net.LookupIP and the result is stored in dnsCache.
//
// On success the returned slice always holds at least one address. On
// failure the slice is nil and the error names the step that failed.
func getIPs(host string) ([]net.IP, error) {
	if ip := net.ParseIP(host); ip != nil {
		return []net.IP{ip}, nil
	}

	urlStruct, err := url.Parse(host)
	if err != nil {
		return nil, fmt.Errorf("url parse: %w", err)
	}
	// Only replace host when the input actually parsed as a URL with a host
	// part; otherwise the input is kept as-is and treated as a hostname.
	if len(urlStruct.Host) > 0 {
		host = urlStruct.Hostname()
		if ip := net.ParseIP(host); ip != nil {
			return []net.IP{ip}, nil
		}
	}

	var ips []net.IP
	// Use a single Get rather than Has followed by Get: the entry may expire
	// or be deleted between the two calls, in which case Get returns nil and
	// calling Value on it would panic.
	if item := dnsCache.Get(host); item != nil {
		ips = item.Value()
	} else {
		ips, err = net.LookupIP(host)
		if err != nil {
			return nil, fmt.Errorf("lookup ip: %w", err)
		}
		dnsCache.Set(host, ips, ttlcache.DefaultTTL)
	}

	if len(ips) == 0 {
		return nil, fmt.Errorf("lookup ip: not resolved")
	}
	return ips, nil
}

View File

@ -0,0 +1,109 @@
package pwextractor
import (
"fmt"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// Fixtures for the DNS-backed test cases: a hostname and the set of
// addresses it is expected to resolve to (order is not significant to the
// assertions that use it).
var (
	testDomain = "dns.google"
	testResult = []string{
		"2001:4860:4860::8844",
		"2001:4860:4860::8888",
		"8.8.8.8",
		"8.8.4.4",
	}
)
// TestGetIPs exercises getIPs with literal addresses, URLs, bare hostnames
// and malformed input, then checks that a hostname lookup populates dnsCache.
//
// The hostname cases resolve testDomain through getIPs, which performs a real
// DNS lookup, so they need working name resolution to pass.
func TestGetIPs(t *testing.T) {
	tests := []struct {
		name              string
		input             string
		expectedIPStrings []string
		wantErr           bool
	}{
		{
			name:              "valid IPv4",
			input:             "192.0.2.1",
			expectedIPStrings: []string{"192.0.2.1"},
			wantErr:           false,
		},
		{
			name:              "valid IPv6",
			input:             "2001:db8::1",
			expectedIPStrings: []string{"2001:db8::1"},
			wantErr:           false,
		},
		{
			name:              "URL with IPv4 host",
			input:             "http://192.0.2.1",
			expectedIPStrings: []string{"192.0.2.1"},
			wantErr:           false,
		},
		{
			name:              "URL with IPv6 host",
			input:             "http://[2001:db8::1]",
			expectedIPStrings: []string{"2001:db8::1"},
			wantErr:           false,
		},
		{
			name:              "URL with hostname",
			input:             fmt.Sprintf("https://%s:8080", testDomain),
			expectedIPStrings: testResult,
			wantErr:           false,
		},
		{
			name:              "hostname",
			input:             testDomain,
			expectedIPStrings: testResult,
			wantErr:           false,
		},
		{
			name:    "invalid IP address",
			input:   "256.0.0.0",
			wantErr: true,
		},
		{
			name:    "empty input",
			input:   "",
			wantErr: true,
		},
		{
			name:    "invalid URL format",
			input:   "://invalid",
			wantErr: true,
		},
		{
			name:    "unresolvable hostname",
			input:   "nonexistent.invalid",
			wantErr: true,
		},
	}

	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			ips, err := getIPs(tc.input)
			if tc.wantErr {
				require.Error(t, err)
				assert.Nil(t, ips)
			} else {
				require.NoError(t, err)
				require.NotEmpty(t, ips, "result slice should not be empty when error is nil")
				if tc.expectedIPStrings != nil {
					ipStrings := make([]string, len(ips))
					for i, ip := range ips {
						ipStrings[i] = ip.String()
					}
					assert.ElementsMatch(t, tc.expectedIPStrings, ipStrings, "IPs do not match expected")
				}
			}
		})
	}

	t.Run("cache set", func(t *testing.T) {
		// Start from an empty cache so the lookup below is what fills it.
		dnsCache.DeleteAll()
		require.False(t, dnsCache.Has(testDomain))
		ips, err := getIPs(testDomain)
		// Fail here on a resolution error instead of surfacing it as a
		// confusing cache assertion failure further down.
		require.NoError(t, err)
		require.True(t, dnsCache.Has(testDomain))
		require.ElementsMatch(t, ips, dnsCache.Get(testDomain).Value())
	})
}