From cd4b160b4dd6f550eadfecfd5419f9c8e7a6f39d Mon Sep 17 00:00:00 2001 From: Egor Aristov Date: Wed, 19 Feb 2025 12:10:35 +0300 Subject: [PATCH] block local ips and service worker; util function to resolve arbitrary 'host-like' string, unit testing --- go.mod | 6 +- go.sum | 10 +- .../extractors/pwextractor/pwextractor.go | 65 +++++++++++ internal/extractors/pwextractor/utils.go | 48 ++++++++ internal/extractors/pwextractor/utils_test.go | 109 ++++++++++++++++++ 5 files changed, 233 insertions(+), 5 deletions(-) create mode 100644 internal/extractors/pwextractor/utils_test.go diff --git a/go.mod b/go.mod index 14db7f0..60841a5 100644 --- a/go.mod +++ b/go.mod @@ -8,6 +8,7 @@ require ( github.com/go-redsync/redsync/v4 v4.8.1 github.com/gorilla/feeds v1.2.0 github.com/ilyakaznacheev/cleanenv v1.5.0 + github.com/jellydator/ttlcache/v3 v3.3.0 github.com/labstack/echo/v4 v4.13.3 github.com/labstack/gommon v0.4.2 github.com/markusmobius/go-dateparser v1.2.3 @@ -15,6 +16,7 @@ require ( github.com/nats-io/nats.go v1.38.0 github.com/playwright-community/playwright-go v0.5001.0 github.com/redis/go-redis/v9 v9.7.0 + github.com/stretchr/testify v1.10.0 golang.org/x/time v0.8.0 ) @@ -36,6 +38,7 @@ require ( github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/coreos/go-semver v0.3.1 // indirect github.com/coreos/go-systemd/v22 v22.5.0 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/deckarep/golang-set/v2 v2.7.0 // indirect github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect github.com/elliotchance/pie/v2 v2.7.0 // indirect @@ -72,8 +75,8 @@ require ( github.com/nats-io/nkeys v0.4.9 // indirect github.com/nats-io/nuid v1.0.1 // indirect github.com/pkg/errors v0.9.1 // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/samuel/go-zookeeper v0.0.0-20201211165307-7117e9ea2414 // indirect - github.com/stretchr/objx v0.5.2 // indirect 
github.com/tetratelabs/wazero v1.2.1 // indirect github.com/thanhpk/randstr v1.0.4 // indirect github.com/valyala/bytebufferpool v1.0.0 // indirect @@ -88,6 +91,7 @@ require ( golang.org/x/crypto v0.32.0 // indirect golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 // indirect golang.org/x/net v0.34.0 // indirect + golang.org/x/sync v0.10.0 // indirect golang.org/x/sys v0.29.0 // indirect golang.org/x/text v0.21.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20240814211410-ddb44dafa142 // indirect diff --git a/go.sum b/go.sum index 7ff408d..4cd7931 100644 --- a/go.sum +++ b/go.sum @@ -210,6 +210,8 @@ github.com/ilyakaznacheev/cleanenv v1.5.0 h1:0VNZXggJE2OYdXE87bfSSwGxeiGt9moSR2l github.com/ilyakaznacheev/cleanenv v1.5.0/go.mod h1:a5aDzaJrLCQZsazHol1w8InnDcOX0OColm64SlIi6gk= github.com/jalaali/go-jalaali v0.0.0-20210801064154-80525e88d958 h1:qxLoi6CAcXVzjfvu+KXIXJOAsQB62LXjsfbOaErsVzE= github.com/jalaali/go-jalaali v0.0.0-20210801064154-80525e88d958/go.mod h1:Wqfu7mjUHj9WDzSSPI5KfBclTTEnLveRUFr/ujWnTgE= +github.com/jellydator/ttlcache/v3 v3.3.0 h1:BdoC9cE81qXfrxeb9eoJi9dWrdhSuwXMAnHTbnBm4Wc= +github.com/jellydator/ttlcache/v3 v3.3.0/go.mod h1:bj2/e0l4jRnQdrnSTaGTsh4GSXvMjQcy41i7th0GVGw= github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8= @@ -300,8 +302,6 @@ github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/playwright-community/playwright-go v0.4901.0 h1:d+1KxF5PNAHZ0gTMQ9bPSyYRWii8soJ7Rt0gLWDejc4= 
-github.com/playwright-community/playwright-go v0.4901.0/go.mod h1:kBNWs/w2aJ2ZUp1wEOOFLXgOqvppFngM5OS+qyhl+ZM= github.com/playwright-community/playwright-go v0.5001.0 h1:EY3oB+rU9cUp6CLHguWE8VMZTwAg+83Yyb7dQqEmGLg= github.com/playwright-community/playwright-go v0.5001.0/go.mod h1:kBNWs/w2aJ2ZUp1wEOOFLXgOqvppFngM5OS+qyhl+ZM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= @@ -375,8 +375,8 @@ go.etcd.io/etcd/client/v3 v3.5.17 h1:o48sINNeWz5+pjy/Z0+HKpj/xSnBkuVhVvXkjEXbqZY go.etcd.io/etcd/client/v3 v3.5.17/go.mod h1:j2d4eXTHWkT2ClBgnnEPm/Wuu7jsqku41v9DZ3OtjQo= go.uber.org/atomic v1.10.0 h1:9qC72Qh0+3MqyJbAn8YU5xVq1frD8bn3JtD2oXtafVQ= go.uber.org/atomic v1.10.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0= -go.uber.org/goleak v1.1.11 h1:wy28qYRKZgnJTxGxvye5/wgWr1EKjmUDGYox5mGlRlI= -go.uber.org/goleak v1.1.11/go.mod h1:cwTWslyiVhfpKIDGSZEM2HlOvcqm+tG4zioyIeLoqMQ= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.9.0 h1:7fIwc/ZtS0q++VgcfqFDxSBZVv/Xo49/SYnDFupUwlI= go.uber.org/multierr v1.9.0/go.mod h1:X2jQV1h+kxSjClGpnseKVIxpmcjrj7MNnI0bnlfKTVQ= go.uber.org/zap v1.24.0 h1:FiJd5l1UOLj0wCgbSE0rwwXHzEdAZS6hiiSnxJN/D60= @@ -423,6 +423,8 @@ golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= +golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys 
v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
diff --git a/internal/extractors/pwextractor/pwextractor.go b/internal/extractors/pwextractor/pwextractor.go
index eb5ba9f..1876b18 100644
--- a/internal/extractors/pwextractor/pwextractor.go
+++ b/internal/extractors/pwextractor/pwextractor.go
@@ -8,6 +8,7 @@ import (
 	"github.com/labstack/gommon/log"
 	"github.com/playwright-community/playwright-go"
 	"maps"
+	"net"
 	"strings"
 	"time"
 )
@@ -30,6 +31,7 @@ type PwExtractor struct {
 	dateParser    DateParser
 	cookieManager CookieManager
 	limiter       limiter.Limiter
+	proxyIP       net.IP
 }
 
 type Config struct {
@@ -50,6 +52,13 @@ func New(cfg Config) (*PwExtractor, error) {
 	if err != nil {
 		return nil, fmt.Errorf("parse proxy: %w", err)
 	}
+	if proxy != nil {
+		proxyIPs, err := getIPs(proxy.Server)
+		if err != nil {
+			return nil, fmt.Errorf("get proxy ip: %w", err)
+		}
+		e.proxyIP = proxyIPs[0]
+	}
 	e.chrome, err = e.pw.Chromium.Launch(playwright.BrowserTypeLaunchOptions{
 		Args: []string{
 			"--webrtc-ip-handling-policy=disable_non_proxied_udp",
@@ -121,6 +130,8 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
 	bCtx, err := e.chrome.NewContext(playwright.BrowserNewContextOptions{
 		ExtraHttpHeaders: headers,
 		UserAgent:        &userAgent,
+		ServiceWorkers:   playwright.ServiceWorkerPolicyBlock,
+		AcceptDownloads:  playwright.Bool(false),
 	})
 	if err != nil {
 		return fmt.Errorf("create browser context: %w", err)
@@ -131,6 +142,44 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
 		}
 	}()
 
+	if err := bCtx.Route("**", func(route playwright.Route) {
+		log.Debugf("Route: %s", route.Request().URL())
+		allowHost, err := e.allowHost(route.Request().URL())
+		if err != nil {
+			log.Errorf("Allow host: %v", err)
+			allowHost = false
+		}
+		if allowHost {
+			if err := route.Continue(); err != nil {
+				log.Warnf("Route continue error: %v", err)
+			}
+		} else {
+			if err := route.Abort(); err != nil {
+				log.Warnf("Route abort error: %v", err)
+			}
+		}
+	}); err != nil {
+		return fmt.Errorf("set route: %w", err)
+	}
+
+	if err := bCtx.RouteWebSocket("**", func(route playwright.WebSocketRoute) {
+		log.Debugf("Websocket route: %s", route.URL())
+		allowHost, err := e.allowHost(route.URL())
+		if err != nil {
+			log.Errorf("Allow host: %v", err)
+			allowHost = false
+		}
+		if allowHost {
+			if _, err := route.ConnectToServer(); err != nil {
+				log.Warnf("Websocket connect error: %v", err)
+			}
+		} else {
+			route.Close()
+		}
+	}); err != nil {
+		return fmt.Errorf("websocket set route: %w", err)
+	}
+
 	if len(cookies) > 0 {
 		var pwCookies []playwright.OptionalCookie
 		for _, cook := range cookies {
@@ -193,6 +242,22 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
 	return err
 }
 
+// allowHost resolves rawUrl via getIPs and reports false if any address is private, loopback, unspecified, link-local, multicast or the proxy IP.
+func (e *PwExtractor) allowHost(rawUrl string) (bool, error) {
+	addrs, err := getIPs(rawUrl)
+	if err != nil {
+		return false, fmt.Errorf("allow host get ips: %w", err)
+	}
+	for _, addr := range addrs {
+		if addr.IsPrivate() || addr.IsLoopback() || addr.IsUnspecified() ||
+			addr.IsLinkLocalUnicast() || addr.IsMulticast() || e.proxyIP.Equal(addr) {
+			log.Debugf("Banned address: %s", rawUrl)
+			return false, nil
+		}
+	}
+	return true, nil
+}
+
 func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errRet error) {
 	errRet = e.visitPage(task, func(page playwright.Page) error {
 		parser := pageParser{
diff --git a/internal/extractors/pwextractor/utils.go b/internal/extractors/pwextractor/utils.go
index 7a83762..bfe4d7f 100644
--- a/internal/extractors/pwextractor/utils.go
+++ b/internal/extractors/pwextractor/utils.go
@@ -2,7 +2,9 @@ package pwextractor
 
 import (
 	"fmt"
+	"github.com/jellydator/ttlcache/v3"
 	"github.com/playwright-community/playwright-go"
+	"net"
 	"net/url"
 	"slices"
 	"strings"
@@ -66,3 +68,49 @@ func parseBaseDomain(urlStr string) (domain string, scheme
string, err error) {
 	}
 	return fmt.Sprintf("%s.%s", domainParts[1], domainParts[0]), scheme, nil
 }
+
+var dnsCache *ttlcache.Cache[string, []net.IP]
+
+func init() {
+	dnsCache = ttlcache.New[string, []net.IP](
+		ttlcache.WithTTL[string, []net.IP](1*time.Minute),
+		ttlcache.WithDisableTouchOnHit[string, []net.IP](),
+	)
+	go dnsCache.Start()
+}
+
+// getIPs resolves a "host-like" string (bare IP, hostname, or URL) to its IP addresses;
+// IP literals are returned as-is, hostnames go through net.LookupIP with a 1-minute cache.
+// The returned slice is never empty when the error is nil.
+func getIPs(host string) ([]net.IP, error) {
+	ip := net.ParseIP(host)
+	if ip != nil {
+		return []net.IP{ip}, nil
+	}
+
+	urlStruct, err := url.Parse(host)
+	if err != nil {
+		return nil, fmt.Errorf("url parse: %w", err)
+	}
+	if len(urlStruct.Host) > 0 {
+		host = urlStruct.Hostname()
+		ip = net.ParseIP(host)
+		if ip != nil {
+			return []net.IP{ip}, nil
+		}
+	}
+
+	// Single Get rather than Has+Get: the entry can expire between the two calls, making Get return nil.
+	if item := dnsCache.Get(host); item != nil {
+		return item.Value(), nil
+	}
+	ips, err := net.LookupIP(host)
+	if err != nil {
+		return nil, fmt.Errorf("lookup ip: %w", err)
+	}
+	if len(ips) == 0 {
+		return nil, fmt.Errorf("lookup ip: not resolved")
+	}
+	dnsCache.Set(host, ips, ttlcache.DefaultTTL)
+	return ips, nil
+}
diff --git a/internal/extractors/pwextractor/utils_test.go b/internal/extractors/pwextractor/utils_test.go
new file mode 100644
index 0000000..e347f42
--- /dev/null
+++ b/internal/extractors/pwextractor/utils_test.go
@@ -0,0 +1,109 @@
+package pwextractor
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+var testDomain = "dns.google"
+var testResult = []string{
+	"2001:4860:4860::8844", "2001:4860:4860::8888", "8.8.8.8", "8.8.4.4",
+}
+
+func TestGetIPs(t *testing.T) {
+	tests := []struct {
+		name              string
+		input             string
+		expectedIPStrings []string
+		wantErr           bool
+	}{
+		{
+			name:              "valid IPv4",
+			input:             "192.0.2.1",
+			expectedIPStrings: []string{"192.0.2.1"},
+			wantErr:           false,
+		},
+		{
+			name:              "valid IPv6",
+			input:             "2001:db8::1",
+			expectedIPStrings: []string{"2001:db8::1"},
+			wantErr:           false,
+		},
+		{
+			name:              "URL with IPv4 host",
+			input:             "http://192.0.2.1",
+			expectedIPStrings: []string{"192.0.2.1"},
+			wantErr:           false,
+		},
+		{
+			name:              "URL with IPv6 host",
+			input:             "http://[2001:db8::1]",
+			expectedIPStrings: []string{"2001:db8::1"},
+			wantErr:           false,
+		},
+		{
+			name:              "URL with hostname",
+			input:             fmt.Sprintf("https://%s:8080", testDomain),
+			expectedIPStrings: testResult,
+			wantErr:           false,
+		},
+		{
+			name:              "hostname",
+			input:             testDomain,
+			expectedIPStrings: testResult,
+			wantErr:           false,
+		},
+		{
+			name:    "invalid IP address",
+			input:   "256.0.0.0",
+			wantErr: true,
+		},
+		{
+			name:    "empty input",
+			input:   "",
+			wantErr: true,
+		},
+		{
+			name:    "invalid URL format",
+			input:   "://invalid",
+			wantErr: true,
+		},
+		{
+			name:    "unresolvable hostname",
+			input:   "nonexistent.invalid",
+			wantErr: true,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			ips, err := getIPs(tc.input)
+
+			if tc.wantErr {
+				require.Error(t, err)
+				assert.Nil(t, ips)
+			} else {
+				require.NoError(t, err)
+				require.NotEmpty(t, ips, "result slice should not be empty when error is nil")
+
+				if tc.expectedIPStrings != nil {
+					ipStrings := make([]string, len(ips))
+					for i, ip := range ips {
+						ipStrings[i] = ip.String()
+					}
+					assert.ElementsMatch(t, tc.expectedIPStrings, ipStrings, "IPs do not match expected")
+				}
+			}
+		})
+	}
+	t.Run("cache set", func(t *testing.T) {
+		dnsCache.DeleteAll()
+		require.False(t, dnsCache.Has(testDomain))
+		ips, _ := getIPs(testDomain)
+		require.True(t, dnsCache.Has(testDomain))
+		require.ElementsMatch(t, ips, dnsCache.Get(testDomain).Value())
+	})
+}