block local ips and service worker; util function to resolve arbitrary 'host-like' string, unit testing
This commit is contained in:
parent
e13f05ea8a
commit
cd4b160b4d
6
go.mod
6
go.mod
@ -8,6 +8,7 @@ require (
|
||||
github.com/go-redsync/redsync/v4 v4.8.1
|
||||
github.com/gorilla/feeds v1.2.0
|
||||
github.com/ilyakaznacheev/cleanenv v1.5.0
|
||||
github.com/jellydator/ttlcache/v3 v3.3.0
|
||||
github.com/labstack/echo/v4 v4.13.3
|
||||
github.com/labstack/gommon v0.4.2
|
||||
github.com/markusmobius/go-dateparser v1.2.3
|
||||
@ -15,6 +16,7 @@ require (
|
||||
github.com/nats-io/nats.go v1.38.0
|
||||
github.com/playwright-community/playwright-go v0.5001.0
|
||||
github.com/redis/go-redis/v9 v9.7.0
|
||||
github.com/stretchr/testify v1.10.0
|
||||
golang.org/x/time v0.8.0
|
||||
)
|
||||
|
||||
@ -36,6 +38,7 @@ require (
|
||||
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
||||
github.com/coreos/go-semver v0.3.1 // indirect
|
||||
github.com/coreos/go-systemd/v22 v22.5.0 // indirect
|
||||
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
|
||||
github.com/deckarep/golang-set/v2 v2.7.0 // indirect
|
||||
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
|
||||
github.com/elliotchance/pie/v2 v2.7.0 // indirect
|
||||
@ -72,8 +75,8 @@ require (
|
||||
github.com/nats-io/nkeys v0.4.9 // indirect
|
||||
github.com/nats-io/nuid v1.0.1 // indirect
|
||||
github.com/pkg/errors v0.9.1 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
|
||||
github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414 // indirect
|
||||
github.com/stretchr/objx v0.5.2 // indirect
|
||||
github.com/tetratelabs/wazero v1.2.1 // indirect
|
||||
github.com/thanhpk/randstr v1.0.4 // indirect
|
||||
github.com/valyala/bytebufferpool v1.0.0 // indirect
|
||||
@ -88,6 +91,7 @@ require (
|
||||
golang.org/x/crypto v0.32.0 // indirect
|
||||
golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 // indirect
|
||||
golang.org/x/net v0.34.0 // indirect
|
||||
golang.org/x/sync v0.10.0 // indirect
|
||||
golang.org/x/sys v0.29.0 // indirect
|
||||
golang.org/x/text v0.21.0 // indirect
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20240814211410-ddb44dafa142 // indirect
|
||||
|
||||
10
go.sum
10
go.sum
@ -210,6 +210,8 @@ github.com/ilyakaznacheev/cleanenv v1.5.0 h1:0VNZXggJE2OYdXE87bfSSwGxeiGt9moSR2l
|
||||
github.com/ilyakaznacheev/cleanenv v1.5.0/go.mod h1:a5aDzaJrLCQZsazHol1w8InnDcOX0OColm64SlIi6gk=
|
||||
github.com/jalaali/go-jalaali v0.0.0-20210801064154-80525e88d958 h1:qxLoi6CAcXVzjfvu+KXIXJOAsQB62LXjsfbOaErsVzE=
|
||||
github.com/jalaali/go-jalaali v0.0.0-20210801064154-80525e88d958/go.mod h1:Wqfu7mjUHj9WDzSSPI5KfBclTTEnLveRUFr/ujWnTgE=
|
||||
github.com/jellydator/ttlcache/v3 v3.3.0 h1:BdoC9cE81qXfrxeb9eoJi9dWrdhSuwXMAnHTbnBm4Wc=
|
||||
github.com/jellydator/ttlcache/v3 v3.3.0/go.mod h1:bj2/e0l4jRnQdrnSTaGTsh4GSXvMjQcy41i7th0GVGw=
|
||||
github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg=
|
||||
github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
|
||||
github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8=
|
||||
@ -300,8 +302,6 @@ github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE
|
||||
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
|
||||
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||
github.com/playwright-community/playwright-go v0.4901.0 h1:d+1KxF5PNAHZ0gTMQ9bPSyYRWii8soJ7Rt0gLWDejc4=
|
||||
github.com/playwright-community/playwright-go v0.4901.0/go.mod h1:kBNWs/w2aJ2ZUp1wEOOFLXgOqvppFngM5OS+qyhl+ZM=
|
||||
github.com/playwright-community/playwright-go v0.5001.0 h1:EY3oB+rU9cUp6CLHguWE8VMZTwAg+83Yyb7dQqEmGLg=
|
||||
github.com/playwright-community/playwright-go v0.5001.0/go.mod h1:kBNWs/w2aJ2ZUp1wEOOFLXgOqvppFngM5OS+qyhl+ZM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
@ -375,8 +375,8 @@ go.etcd.io/etcd/client/v3 v3.5.17 h1:o48sINNeWz5+pjy/Z0+HKpj/xSnBkuVhVvXkjEXbqZY
|
||||
go.etcd.io/etcd/client/v3 v3.5.17/go.mod h1:j2d4eXTHWkT2ClBgnnEPm/Wuu7jsqku41v9DZ3OtjQo=
|
||||
go.uber.org/atomic v1.10.0 h1:9qC72Qh0+3MqyJbAn8YU5xVq1frD8bn3JtD2oXtafVQ=
|
||||
go.uber.org/atomic v1.10.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0=
|
||||
go.uber.org/goleak v1.1.11 h1:wy28qYRKZgnJTxGxvye5/wgWr1EKjmUDGYox5mGlRlI=
|
||||
go.uber.org/goleak v1.1.11/go.mod h1:cwTWslyiVhfpKIDGSZEM2HlOvcqm+tG4zioyIeLoqMQ=
|
||||
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
|
||||
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
|
||||
go.uber.org/multierr v1.9.0 h1:7fIwc/ZtS0q++VgcfqFDxSBZVv/Xo49/SYnDFupUwlI=
|
||||
go.uber.org/multierr v1.9.0/go.mod h1:X2jQV1h+kxSjClGpnseKVIxpmcjrj7MNnI0bnlfKTVQ=
|
||||
go.uber.org/zap v1.24.0 h1:FiJd5l1UOLj0wCgbSE0rwwXHzEdAZS6hiiSnxJN/D60=
|
||||
@ -423,6 +423,8 @@ golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJ
|
||||
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ=
|
||||
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
|
||||
@ -8,6 +8,7 @@ import (
|
||||
"github.com/labstack/gommon/log"
|
||||
"github.com/playwright-community/playwright-go"
|
||||
"maps"
|
||||
"net"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
@ -30,6 +31,7 @@ type PwExtractor struct {
|
||||
dateParser DateParser
|
||||
cookieManager CookieManager
|
||||
limiter limiter.Limiter
|
||||
proxyIP net.IP
|
||||
}
|
||||
|
||||
type Config struct {
|
||||
@ -50,6 +52,13 @@ func New(cfg Config) (*PwExtractor, error) {
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("parse proxy: %w", err)
|
||||
}
|
||||
if proxy != nil {
|
||||
proxyIPs, err := getIPs(proxy.Server)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("get proxy ip: %w", err)
|
||||
}
|
||||
e.proxyIP = proxyIPs[0]
|
||||
}
|
||||
e.chrome, err = e.pw.Chromium.Launch(playwright.BrowserTypeLaunchOptions{
|
||||
Args: []string{
|
||||
"--webrtc-ip-handling-policy=disable_non_proxied_udp",
|
||||
@ -121,6 +130,8 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
|
||||
bCtx, err := e.chrome.NewContext(playwright.BrowserNewContextOptions{
|
||||
ExtraHttpHeaders: headers,
|
||||
UserAgent: &userAgent,
|
||||
ServiceWorkers: playwright.ServiceWorkerPolicyBlock,
|
||||
AcceptDownloads: playwright.Bool(false),
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("create browser context: %w", err)
|
||||
@ -131,6 +142,44 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
|
||||
}
|
||||
}()
|
||||
|
||||
if err := bCtx.Route("**", func(route playwright.Route) {
|
||||
log.Debugf("Route: %s", route.Request().URL())
|
||||
allowHost, err := e.allowHost(route.Request().URL())
|
||||
if err != nil {
|
||||
log.Errorf("Allow host: %v", err)
|
||||
allowHost = false
|
||||
}
|
||||
if allowHost {
|
||||
if err := route.Continue(); err != nil {
|
||||
log.Warnf("Route continue error: %v", err)
|
||||
}
|
||||
} else {
|
||||
if err := route.Abort(); err != nil {
|
||||
log.Warnf("Route abort error: %v", err)
|
||||
}
|
||||
}
|
||||
}); err != nil {
|
||||
return fmt.Errorf("set route: %w", err)
|
||||
}
|
||||
|
||||
if err := bCtx.RouteWebSocket("**", func(route playwright.WebSocketRoute) {
|
||||
log.Debugf("Websocket route: %s", route.URL())
|
||||
allowHost, err := e.allowHost(route.URL())
|
||||
if err != nil {
|
||||
log.Errorf("Allow host: %v", err)
|
||||
allowHost = false
|
||||
}
|
||||
if allowHost {
|
||||
if _, err := route.ConnectToServer(); err != nil {
|
||||
log.Warnf("Websocket connect error: %v", err)
|
||||
}
|
||||
} else {
|
||||
route.Close()
|
||||
}
|
||||
}); err != nil {
|
||||
return fmt.Errorf("websocket set route: %w", err)
|
||||
}
|
||||
|
||||
if len(cookies) > 0 {
|
||||
var pwCookies []playwright.OptionalCookie
|
||||
for _, cook := range cookies {
|
||||
@ -193,6 +242,22 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
|
||||
return err
|
||||
}
|
||||
|
||||
func (e *PwExtractor) allowHost(rawUrl string) (bool, error) {
|
||||
ips, err := getIPs(rawUrl)
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("allow host get ips: %w", err)
|
||||
}
|
||||
for _, ip := range ips {
|
||||
deny := ip.IsPrivate() || ip.IsLoopback() || ip.IsUnspecified() || ip.IsLinkLocalUnicast() || ip.IsMulticast()
|
||||
deny = deny || e.proxyIP.Equal(ip)
|
||||
if deny {
|
||||
log.Debugf("Banned address: %s", rawUrl)
|
||||
return false, nil
|
||||
}
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
|
||||
func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errRet error) {
|
||||
errRet = e.visitPage(task, func(page playwright.Page) error {
|
||||
parser := pageParser{
|
||||
|
||||
@ -2,7 +2,9 @@ package pwextractor
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/jellydator/ttlcache/v3"
|
||||
"github.com/playwright-community/playwright-go"
|
||||
"net"
|
||||
"net/url"
|
||||
"slices"
|
||||
"strings"
|
||||
@ -66,3 +68,49 @@ func parseBaseDomain(urlStr string) (domain string, scheme string, err error) {
|
||||
}
|
||||
return fmt.Sprintf("%s.%s", domainParts[1], domainParts[0]), scheme, nil
|
||||
}
|
||||
|
||||
var dnsCache *ttlcache.Cache[string, []net.IP]
|
||||
|
||||
func init() {
|
||||
dnsCache = ttlcache.New[string, []net.IP](
|
||||
ttlcache.WithTTL[string, []net.IP](1*time.Minute),
|
||||
ttlcache.WithDisableTouchOnHit[string, []net.IP](),
|
||||
)
|
||||
go dnsCache.Start()
|
||||
}
|
||||
|
||||
// getIPs from url, hostname, ip string
|
||||
// result slice len always > 0 if error is nil
|
||||
func getIPs(host string) ([]net.IP, error) {
|
||||
ip := net.ParseIP(host)
|
||||
if ip != nil {
|
||||
return []net.IP{ip}, nil
|
||||
}
|
||||
|
||||
urlStruct, err := url.Parse(host)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("url parse: %w", err)
|
||||
}
|
||||
if len(urlStruct.Host) > 0 {
|
||||
host = urlStruct.Hostname()
|
||||
ip = net.ParseIP(host)
|
||||
if ip != nil {
|
||||
return []net.IP{ip}, nil
|
||||
}
|
||||
}
|
||||
|
||||
var ips []net.IP
|
||||
if dnsCache.Has(host) {
|
||||
ips = dnsCache.Get(host).Value()
|
||||
} else {
|
||||
ips, err = net.LookupIP(host)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("lookup ip: %w", err)
|
||||
}
|
||||
dnsCache.Set(host, ips, ttlcache.DefaultTTL)
|
||||
}
|
||||
if len(ips) == 0 {
|
||||
return nil, fmt.Errorf("lookip ip: not resolved")
|
||||
}
|
||||
return ips, nil
|
||||
}
|
||||
|
||||
109
internal/extractors/pwextractor/utils_test.go
Normal file
109
internal/extractors/pwextractor/utils_test.go
Normal file
@ -0,0 +1,109 @@
|
||||
package pwextractor
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
var testDomain = "dns.google"
|
||||
var testResult = []string{
|
||||
"2001:4860:4860::8844", "2001:4860:4860::8888", "8.8.8.8", "8.8.4.4",
|
||||
}
|
||||
|
||||
func TestGetIPs(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
expectedIPStrings []string
|
||||
wantErr bool
|
||||
}{
|
||||
{
|
||||
name: "valid IPv4",
|
||||
input: "192.0.2.1",
|
||||
expectedIPStrings: []string{"192.0.2.1"},
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "valid IPv6",
|
||||
input: "2001:db8::1",
|
||||
expectedIPStrings: []string{"2001:db8::1"},
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "URL with IPv4 host",
|
||||
input: "http://192.0.2.1",
|
||||
expectedIPStrings: []string{"192.0.2.1"},
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "URL with IPv6 host",
|
||||
input: "http://[2001:db8::1]",
|
||||
expectedIPStrings: []string{"2001:db8::1"},
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "URL with hostname",
|
||||
input: fmt.Sprintf("https://%s:8080", testDomain),
|
||||
expectedIPStrings: testResult,
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "hostname",
|
||||
input: testDomain,
|
||||
expectedIPStrings: testResult,
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "invalid IP address",
|
||||
input: "256.0.0.0",
|
||||
wantErr: true,
|
||||
},
|
||||
{
|
||||
name: "empty input",
|
||||
input: "",
|
||||
wantErr: true,
|
||||
},
|
||||
{
|
||||
name: "invalid URL format",
|
||||
input: "://invalid",
|
||||
wantErr: true,
|
||||
},
|
||||
{
|
||||
name: "unresolvable hostname",
|
||||
input: "nonexistent.invalid",
|
||||
wantErr: true,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
ips, err := getIPs(tc.input)
|
||||
|
||||
if tc.wantErr {
|
||||
require.Error(t, err)
|
||||
assert.Nil(t, ips)
|
||||
} else {
|
||||
require.NoError(t, err)
|
||||
require.NotEmpty(t, ips, "result slice should not be empty when error is nil")
|
||||
|
||||
if tc.expectedIPStrings != nil {
|
||||
ipStrings := make([]string, len(ips))
|
||||
for i, ip := range ips {
|
||||
ipStrings[i] = ip.String()
|
||||
}
|
||||
assert.ElementsMatch(t, tc.expectedIPStrings, ipStrings, "IPs do not match expected")
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
t.Run("cache set", func(t *testing.T) {
|
||||
dnsCache.DeleteAll()
|
||||
require.False(t, dnsCache.Has(testDomain))
|
||||
ips, _ := getIPs(testDomain)
|
||||
require.True(t, dnsCache.Has(testDomain))
|
||||
require.ElementsMatch(t, ips, dnsCache.Get(testDomain).Value())
|
||||
})
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user