From 9bfe51f9c8d806e435886f1f44559490994ee97d Mon Sep 17 00:00:00 2001 From: Egor Aristov Date: Sat, 18 Jan 2025 12:45:04 +0300 Subject: [PATCH] proxy support; config validation --- .gitignore | 2 + cmd/extractor/extractor.go | 17 ++++++-- cmd/webserver/webserver.go | 11 +---- cmd/worker/worker.go | 14 ++----- internal/config/config.go | 40 +++++++++++++++++++ .../extractors/pwextractor/pwextractor.go | 8 +++- internal/extractors/pwextractor/utils.go | 21 ++++++++++ 7 files changed, 89 insertions(+), 24 deletions(-) create mode 100644 internal/config/config.go diff --git a/.gitignore b/.gitignore index 5d88c84..1703b2c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ /.idea/ /trash/ +/todo.md +/.env.dev diff --git a/cmd/extractor/extractor.go b/cmd/extractor/extractor.go index 888015e..0dfd5ea 100644 --- a/cmd/extractor/extractor.go +++ b/cmd/extractor/extractor.go @@ -1,6 +1,7 @@ package main import ( + "github.com/egor3f/rssalchemy/internal/config" "github.com/egor3f/rssalchemy/internal/extractors/pwextractor" "github.com/egor3f/rssalchemy/internal/models" "github.com/labstack/gommon/log" @@ -11,10 +12,13 @@ func main() { log.SetLevel(log.DEBUG) log.SetHeader(`${level}`) + // this code is temporary! + // todo: rewrite not to use hardcoded tasks + task := models.Task{ - URL: "https://vombat.su", - SelectorPost: "div.post-body", - SelectorTitle: "h1 a", + URL: "https://2ip.ru", + SelectorPost: "div.ip", + SelectorTitle: "span", SelectorLink: "h1 a", SelectorDescription: "div.post-content-block p", SelectorAuthor: "a:has(> span.post-author)", @@ -23,7 +27,12 @@ func main() { SelectorEnclosure: "article img.object-contain", } - pwe, err := pwextractor.New() + cfg, err := config.Read() + if err != nil { + log.Panicf("read config: %v", err) + } + + pwe, err := pwextractor.New(cfg) if err != nil { log.Panicf("create pw extractor: %v", err) } diff --git a/cmd/webserver/webserver.go b/cmd/webserver/webserver.go index 88e5f09..28d2121 100644 --- a/cmd/webserver/webserver.go +++ b/cmd/webserver/webserver.go @@ -8,11 +8,11 @@ import ( "encoding/json" "fmt" "github.com/egor3f/rssalchemy/internal/adapters/natsadapter" + "github.com/egor3f/rssalchemy/internal/config" "github.com/egor3f/rssalchemy/internal/models" "github.com/ericchiang/css" "github.com/go-playground/validator/v10" "github.com/gorilla/feeds" - "github.com/ilyakaznacheev/cleanenv" "github.com/labstack/echo/v4" "github.com/labstack/echo/v4/middleware" "github.com/labstack/gommon/log" @@ -26,12 +26,6 @@ import ( "time" ) -type Config struct { - WebserverAddress string `yaml:"webserver_address" env:"WEBSERVER_ADDRESS" env-required:"true"` - NatsUrl string `yaml:"nats_url" env:"NATS_URL" env-required:"true"` - Debug bool `yaml:"debug" env:"DEBUG"` -} - type Specs struct { URL string `json:"URL" validate:"url"` SelectorPost string `json:"selector_post" validate:"selector"` @@ -46,8 +40,7 @@ type Specs struct { } func main() { - var cfg Config - err := cleanenv.ReadConfig("config.yml", &cfg) + cfg, err := config.Read() if err != nil { log.Panicf("reading config failed: %v", err) } diff --git a/cmd/worker/worker.go b/cmd/worker/worker.go index e5e08bf..69f2c40 100644 --- a/cmd/worker/worker.go +++ b/cmd/worker/worker.go @@ -5,25 +5,19 @@ import ( "encoding/json" "fmt" "github.com/egor3f/rssalchemy/internal/adapters/natsadapter" + "github.com/egor3f/rssalchemy/internal/config" "github.com/egor3f/rssalchemy/internal/extractors/pwextractor" "github.com/egor3f/rssalchemy/internal/models" - "github.com/ilyakaznacheev/cleanenv" "github.com/labstack/gommon/log" "github.com/nats-io/nats.go" "os" "os/signal" ) -type Config struct { - NatsUrl string `yaml:"nats_url" env:"NATS_URL" env-required:"true"` - Debug bool `yaml:"debug" env:"DEBUG"` -} - func main() { - var cfg Config - err := cleanenv.ReadConfig("config.yml", &cfg) + cfg, err := config.Read() if err != nil { - log.Panicf("reading config failed: %w", err) + log.Panicf("reading config failed: %v", err) } if cfg.Debug { @@ -53,7 +47,7 @@ func main() { log.Panicf("create nats adapter: %v", err) } - pwe, err := pwextractor.New() + pwe, err := pwextractor.New(cfg) if err != nil { log.Panicf("create pw extractor: %v", err) } diff --git a/internal/config/config.go b/internal/config/config.go new file mode 100644 index 0000000..04f4d75 --- /dev/null +++ b/internal/config/config.go @@ -0,0 +1,40 @@ +package config + +import ( + "fmt" + "github.com/go-playground/validator/v10" + "github.com/ilyakaznacheev/cleanenv" + "net/url" + "reflect" + "slices" +) + +type Config struct { + WebserverAddress string `yaml:"webserver_address" env:"WEBSERVER_ADDRESS" env-required:"true" validate:"hostname_port"` + NatsUrl string `yaml:"nats_url" env:"NATS_URL" env-required:"true" validate:"url"` + Debug bool `yaml:"debug" env:"DEBUG"` + Proxy string `yaml:"proxy" env:"PROXY" env-default:"" validate:"omitempty,proxy"` +} + +func Read() (Config, error) { + var cfg Config + err := cleanenv.ReadConfig("config.yml", &cfg) + if err != nil { + return Config{}, err + } + validate := validator.New() + if err := validate.RegisterValidation("proxy", validateProxy); err != nil { + panic(fmt.Errorf("register validation: %w", err)) + } + err = validate.Struct(cfg) + return cfg, err +} + +func validateProxy(fl validator.FieldLevel) bool { + if fl.Field().Kind() != reflect.String { + return false + } + validSchemes := []string{"http", "https", "socks"} + pUrl, err := url.Parse(fl.Field().String()) + return err == nil && slices.Contains(validSchemes, pUrl.Scheme) && pUrl.Opaque == "" && pUrl.Path == "" +} diff --git a/internal/extractors/pwextractor/pwextractor.go b/internal/extractors/pwextractor/pwextractor.go index f925c61..78e2cc9 100644 --- a/internal/extractors/pwextractor/pwextractor.go +++ b/internal/extractors/pwextractor/pwextractor.go @@ -3,6 +3,7 @@ package pwextractor import ( _ "embed" "fmt" + "github.com/egor3f/rssalchemy/internal/config" "github.com/egor3f/rssalchemy/internal/models" "github.com/labstack/gommon/log" "github.com/markusmobius/go-dateparser" @@ -23,16 +24,21 @@ type PwExtractor struct { chrome playwright.Browser } -func New() (*PwExtractor, error) { +func New(cfg config.Config) (*PwExtractor, error) { e := PwExtractor{} var err error e.pw, err = playwright.Run() if err != nil { return nil, fmt.Errorf("run playwright: %w", err) } + proxy, err := parseProxy(cfg.Proxy) + if err != nil { + return nil, fmt.Errorf("parse proxy: %w", err) + } e.chrome, err = e.pw.Chromium.Launch(playwright.BrowserTypeLaunchOptions{ ChromiumSandbox: playwright.Bool(true), HandleSIGINT: playwright.Bool(false), + Proxy: proxy, Timeout: pwDuration("5s"), }) if err != nil { diff --git a/internal/extractors/pwextractor/utils.go b/internal/extractors/pwextractor/utils.go index 431874f..840c478 100644 --- a/internal/extractors/pwextractor/utils.go +++ b/internal/extractors/pwextractor/utils.go @@ -30,3 +30,24 @@ func pwDuration(s string) *float64 { f64 := float64(dur.Milliseconds()) return &f64 } + +func parseProxy(s string) (*playwright.Proxy, error) { + var proxy *playwright.Proxy + if len(s) > 0 { + proxyUrl, err := url.Parse(s) + if err != nil { + return nil, err + } + urlWithoutUser := *proxyUrl + urlWithoutUser.User = nil + proxy = &playwright.Proxy{Server: urlWithoutUser.String()} + if proxyUrl.User != nil { + user := proxyUrl.User.Username() + proxy.Username = &user + if pass, exist := proxyUrl.User.Password(); exist { + proxy.Password = &pass + } + } + } + return proxy, nil +}