From 5ff8b7e6d01cacaf7a91b0021dc86a6c745c5088 Mon Sep 17 00:00:00 2001 From: Egor Aristov Date: Thu, 23 Jan 2025 14:37:50 +0300 Subject: [PATCH] headers and cookies support --- internal/delivery/http/handler.go | 11 +++ .../extractors/pwextractor/pwextractor.go | 74 ++++++++++++++++--- internal/extractors/pwextractor/utils.go | 31 ++++++++ internal/models/models.go | 2 + 4 files changed, 109 insertions(+), 9 deletions(-) diff --git a/internal/delivery/http/handler.go b/internal/delivery/http/handler.go index 9a33820..2250d4f 100644 --- a/internal/delivery/http/handler.go +++ b/internal/delivery/http/handler.go @@ -126,6 +126,7 @@ func (h *Handler) handlePageScreenshot(c echo.Context) error { task := models.Task{ TaskType: models.TaskTypePageScreenshot, URL: pageUrl, + Headers: extractHeaders(c), } timeoutCtx, cancel := context.WithTimeout(context.Background(), taskTimeout) @@ -201,3 +202,13 @@ func makeFeed(task models.Task, result models.TaskResult) (string, error) { } return atom, nil } + +func extractHeaders(c echo.Context) map[string]string { + headers := make(map[string]string) + for _, hName := range []string{"Accept-Language", "Cookie"} { + if len(c.Request().Header.Get(hName)) > 0 { + headers[hName] = c.Request().Header.Get(hName) + } + } + return headers +} diff --git a/internal/extractors/pwextractor/pwextractor.go b/internal/extractors/pwextractor/pwextractor.go index 99885a5..1773fc7 100644 --- a/internal/extractors/pwextractor/pwextractor.go +++ b/internal/extractors/pwextractor/pwextractor.go @@ -9,6 +9,7 @@ import ( "github.com/labstack/gommon/log" "github.com/markusmobius/go-dateparser" "github.com/playwright-community/playwright-go" + "maps" ) // Timeouts @@ -20,6 +21,9 @@ var ( defOptEval = playwright.LocatorEvaluateOptions{Timeout: pwDuration(defTimeout)} ) +var userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36" +var secChUa = `"Chromium";v="132", "Google Chrome";v="132", "Not-A.Brand";v="99"` + type PwExtractor struct { pw *playwright.Playwright chrome playwright.Browser @@ -58,30 +62,82 @@ func (e *PwExtractor) Stop() error { return nil } -func (e *PwExtractor) visitPage(pageUrl string, cb func(page playwright.Page) error) (errRet error) { - page, err := e.chrome.NewPage() +func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page) error) (errRet error) { + headers := maps.Clone(task.Headers) + headers["Sec-Ch-Ua"] = secChUa + var cookieStr string + if v, ok := headers["Cookie"]; ok { + cookieStr = v + delete(headers, "Cookie") + } + + bCtx, err := e.chrome.NewContext(playwright.BrowserNewContextOptions{ + ExtraHttpHeaders: headers, + UserAgent: &userAgent, + }) + if err != nil { + return fmt.Errorf("create browser context: %w", err) + } + defer func() { + if err := bCtx.Close(); err != nil { + errRet = fmt.Errorf("close context: %w; other error=%w", err, errRet) + } + }() + + if len(cookieStr) > 0 { + cookies, err := parseCookieString(cookieStr) + if err != nil { + return fmt.Errorf("parsing cookies: %w", err) + } + + baseDomain, err := parseBaseDomain(task.URL) + if err != nil { + return fmt.Errorf("parse base domain: %w", err) + } + + var pwCookies []playwright.OptionalCookie + for k, v := range cookies { + pwCookies = append(pwCookies, playwright.OptionalCookie{ + Name: k, + Value: v, + Domain: playwright.String(fmt.Sprintf(".%s", baseDomain)), + Path: playwright.String("/"), + }) + } + + if err := bCtx.AddCookies(pwCookies); err != nil { + return fmt.Errorf("add cookies: %w", err) + } + } + + page, err := bCtx.NewPage() if err != nil { return fmt.Errorf("browser new page: %w", err) } defer func() { - err := page.Close() - if err != nil { + if err := page.Close(); err != nil { errRet = fmt.Errorf("close page: %w; other error=%w", err, errRet) } }() log.Debugf("Page opened") - if _, err := page.Goto(pageUrl); err != nil { + if len(task.Headers) > 0 { + if err := page.SetExtraHTTPHeaders(task.Headers); err != nil { + return fmt.Errorf("set headers: %w", err) + } + } + + if _, err := page.Goto(task.URL, playwright.PageGotoOptions{Timeout: pwDuration("10s")}); err != nil { return fmt.Errorf("goto page: %w", err) } - log.Debugf("Url %s visited", pageUrl) - defer log.Debugf("Visiting page %s finished", pageUrl) + log.Debugf("Url %s visited", task.URL) + defer log.Debugf("Visiting page %s finished", task.URL) return cb(page) } func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errRet error) { - errRet = e.visitPage(task.URL, func(page playwright.Page) error { + errRet = e.visitPage(task, func(page playwright.Page) error { parser := pageParser{ task: task, page: page, @@ -100,7 +156,7 @@ func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errR } func (e *PwExtractor) Screenshot(task models.Task) (result *models.ScreenshotTaskResult, errRet error) { - errRet = e.visitPage(task.URL, func(page playwright.Page) error { + errRet = e.visitPage(task, func(page playwright.Page) error { err := page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{ State: playwright.LoadStateNetworkidle, Timeout: pwDuration("5s"), diff --git a/internal/extractors/pwextractor/utils.go b/internal/extractors/pwextractor/utils.go index 840c478..6eac213 100644 --- a/internal/extractors/pwextractor/utils.go +++ b/internal/extractors/pwextractor/utils.go @@ -4,6 +4,7 @@ import ( "fmt" "github.com/playwright-community/playwright-go" "net/url" + "slices" "strings" "time" ) @@ -51,3 +52,33 @@ func parseProxy(s string) (*playwright.Proxy, error) { } return proxy, nil } + +func parseBaseDomain(urlStr string) (string, error) { + pageUrl, err := url.Parse(urlStr) + if err != nil { + return "", fmt.Errorf("task url parsing: %w", err) + } + domainParts := strings.Split(pageUrl.Host, ".") + slices.Reverse(domainParts) // com, example, www + return fmt.Sprintf("%s.%s", domainParts[1], domainParts[0]), nil +} + +func parseCookieString(cookieStr string) (map[string]string, error) { + result := make(map[string]string) + failed := fmt.Errorf("failed to parse cookies") + + for _, cook := range strings.Split(cookieStr, ";") { + kv := strings.Split(cook, "=") + if len(kv) != 2 { + return nil, failed + } + k, err1 := url.QueryUnescape(kv[0]) + v, err2 := url.QueryUnescape(kv[1]) + if err1 != nil || err2 != nil { + return nil, failed + } + result[k] = v + } + + return result, nil +} diff --git a/internal/models/models.go b/internal/models/models.go index 689dbe9..7099d92 100644 --- a/internal/models/models.go +++ b/internal/models/models.go @@ -25,6 +25,7 @@ type Task struct { SelectorCreated string SelectorContent string SelectorEnclosure string + Headers map[string]string } func (t Task) CacheKey() string { @@ -38,6 +39,7 @@ func (t Task) CacheKey() string { h.Write([]byte(t.SelectorCreated)) h.Write([]byte(t.SelectorContent)) h.Write([]byte(t.SelectorEnclosure)) + h.Write([]byte(fmt.Sprintf("%+v", t.Headers))) return fmt.Sprintf("%s_%x", t.TaskType, h.Sum(nil)) }