headers and cookies support

This commit is contained in:
Egor Aristov 2025-01-23 14:37:50 +03:00
parent ab4966052b
commit 5ff8b7e6d0
4 changed files with 109 additions and 9 deletions

View File

@ -126,6 +126,7 @@ func (h *Handler) handlePageScreenshot(c echo.Context) error {
task := models.Task{ task := models.Task{
TaskType: models.TaskTypePageScreenshot, TaskType: models.TaskTypePageScreenshot,
URL: pageUrl, URL: pageUrl,
Headers: extractHeaders(c),
} }
timeoutCtx, cancel := context.WithTimeout(context.Background(), taskTimeout) timeoutCtx, cancel := context.WithTimeout(context.Background(), taskTimeout)
@ -201,3 +202,13 @@ func makeFeed(task models.Task, result models.TaskResult) (string, error) {
} }
return atom, nil return atom, nil
} }
// extractHeaders collects the subset of the incoming request's headers
// that should be forwarded to the headless browser. Only Accept-Language
// and Cookie are passed through; all other headers are dropped.
func extractHeaders(c echo.Context) map[string]string {
	headers := make(map[string]string)
	for _, hName := range []string{"Accept-Language", "Cookie"} {
		// Read each header once instead of twice (the original called
		// Header.Get both in the check and in the assignment).
		if v := c.Request().Header.Get(hName); v != "" {
			headers[hName] = v
		}
	}
	return headers
}

View File

@ -9,6 +9,7 @@ import (
"github.com/labstack/gommon/log" "github.com/labstack/gommon/log"
"github.com/markusmobius/go-dateparser" "github.com/markusmobius/go-dateparser"
"github.com/playwright-community/playwright-go" "github.com/playwright-community/playwright-go"
"maps"
) )
// Timeouts // Timeouts
@ -20,6 +21,9 @@ var (
defOptEval = playwright.LocatorEvaluateOptions{Timeout: pwDuration(defTimeout)} defOptEval = playwright.LocatorEvaluateOptions{Timeout: pwDuration(defTimeout)}
) )
var userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36"
var secChUa = `"Chromium";v="132", "Google Chrome";v="132", "Not-A.Brand";v="99"`
type PwExtractor struct { type PwExtractor struct {
pw *playwright.Playwright pw *playwright.Playwright
chrome playwright.Browser chrome playwright.Browser
@ -58,30 +62,82 @@ func (e *PwExtractor) Stop() error {
return nil return nil
} }
func (e *PwExtractor) visitPage(pageUrl string, cb func(page playwright.Page) error) (errRet error) { func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page) error) (errRet error) {
page, err := e.chrome.NewPage() headers := maps.Clone(task.Headers)
headers["Sec-Ch-Ua"] = secChUa
var cookieStr string
if v, ok := headers["Cookie"]; ok {
cookieStr = v
delete(headers, "Cookie")
}
bCtx, err := e.chrome.NewContext(playwright.BrowserNewContextOptions{
ExtraHttpHeaders: headers,
UserAgent: &userAgent,
})
if err != nil {
return fmt.Errorf("create browser context: %w", err)
}
defer func() {
if err := bCtx.Close(); err != nil {
errRet = fmt.Errorf("close context: %w; other error=%w", err, errRet)
}
}()
if len(cookieStr) > 0 {
cookies, err := parseCookieString(cookieStr)
if err != nil {
return fmt.Errorf("parsing cookies: %w", err)
}
baseDomain, err := parseBaseDomain(task.URL)
if err != nil {
return fmt.Errorf("parse base domain: %w", err)
}
var pwCookies []playwright.OptionalCookie
for k, v := range cookies {
pwCookies = append(pwCookies, playwright.OptionalCookie{
Name: k,
Value: v,
Domain: playwright.String(fmt.Sprintf(".%s", baseDomain)),
Path: playwright.String("/"),
})
}
if err := bCtx.AddCookies(pwCookies); err != nil {
return fmt.Errorf("add cookies: %w", err)
}
}
page, err := bCtx.NewPage()
if err != nil { if err != nil {
return fmt.Errorf("browser new page: %w", err) return fmt.Errorf("browser new page: %w", err)
} }
defer func() { defer func() {
err := page.Close() if err := page.Close(); err != nil {
if err != nil {
errRet = fmt.Errorf("close page: %w; other error=%w", err, errRet) errRet = fmt.Errorf("close page: %w; other error=%w", err, errRet)
} }
}() }()
log.Debugf("Page opened") log.Debugf("Page opened")
if _, err := page.Goto(pageUrl); err != nil { if len(task.Headers) > 0 {
if err := page.SetExtraHTTPHeaders(task.Headers); err != nil {
return fmt.Errorf("set headers: %w", err)
}
}
if _, err := page.Goto(task.URL, playwright.PageGotoOptions{Timeout: pwDuration("10s")}); err != nil {
return fmt.Errorf("goto page: %w", err) return fmt.Errorf("goto page: %w", err)
} }
log.Debugf("Url %s visited", pageUrl) log.Debugf("Url %s visited", task.URL)
defer log.Debugf("Visiting page %s finished", pageUrl) defer log.Debugf("Visiting page %s finished", task.URL)
return cb(page) return cb(page)
} }
func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errRet error) { func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errRet error) {
errRet = e.visitPage(task.URL, func(page playwright.Page) error { errRet = e.visitPage(task, func(page playwright.Page) error {
parser := pageParser{ parser := pageParser{
task: task, task: task,
page: page, page: page,
@ -100,7 +156,7 @@ func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errR
} }
func (e *PwExtractor) Screenshot(task models.Task) (result *models.ScreenshotTaskResult, errRet error) { func (e *PwExtractor) Screenshot(task models.Task) (result *models.ScreenshotTaskResult, errRet error) {
errRet = e.visitPage(task.URL, func(page playwright.Page) error { errRet = e.visitPage(task, func(page playwright.Page) error {
err := page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{ err := page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
State: playwright.LoadStateNetworkidle, State: playwright.LoadStateNetworkidle,
Timeout: pwDuration("5s"), Timeout: pwDuration("5s"),

View File

@ -4,6 +4,7 @@ import (
"fmt" "fmt"
"github.com/playwright-community/playwright-go" "github.com/playwright-community/playwright-go"
"net/url" "net/url"
"slices"
"strings" "strings"
"time" "time"
) )
@ -51,3 +52,33 @@ func parseProxy(s string) (*playwright.Proxy, error) {
} }
return proxy, nil return proxy, nil
} }
// parseBaseDomain extracts the registrable base domain — the last two
// labels of the host, e.g. "example.com" from "www.example.com" — so the
// caller can scope cookies to ".example.com" and all its subdomains.
//
// NOTE(review): this is a naive eTLD+1 heuristic; multi-level public
// suffixes such as "co.uk" are not handled.
func parseBaseDomain(urlStr string) (string, error) {
	pageUrl, err := url.Parse(urlStr)
	if err != nil {
		return "", fmt.Errorf("task url parsing: %w", err)
	}
	// Hostname() strips any ":port" — the original used .Host, which made
	// "example.com:8080" yield the bogus domain "example.com:8080".
	host := pageUrl.Hostname()
	if host == "" {
		return "", fmt.Errorf("url %q has no host", urlStr)
	}
	parts := strings.Split(host, ".")
	// Single-label hosts ("localhost"): the original indexed parts[1]
	// unconditionally and panicked here.
	if len(parts) < 2 {
		return host, nil
	}
	return parts[len(parts)-2] + "." + parts[len(parts)-1], nil
}
// parseCookieString splits a raw Cookie header ("a=1; b=2") into a
// name→value map. Names and values are URL-unescaped, mirroring how the
// original version treated them.
//
// Returns an error when any pair is malformed (missing '=') or fails to
// unescape.
func parseCookieString(cookieStr string) (map[string]string, error) {
	result := make(map[string]string)
	errMalformed := fmt.Errorf("failed to parse cookies")
	for _, pair := range strings.Split(cookieStr, ";") {
		// Cut on the FIRST '=' only: cookie values may themselves contain
		// '=' (e.g. base64 padding). The original Split-based code
		// rejected such cookies as malformed.
		name, value, found := strings.Cut(pair, "=")
		if !found {
			return nil, errMalformed
		}
		// A well-formed Cookie header separates pairs with "; "; trim the
		// leading space that the original left attached to the name.
		k, err1 := url.QueryUnescape(strings.TrimSpace(name))
		v, err2 := url.QueryUnescape(value)
		if err1 != nil || err2 != nil {
			return nil, errMalformed
		}
		result[k] = v
	}
	return result, nil
}

View File

@ -25,6 +25,7 @@ type Task struct {
SelectorCreated string SelectorCreated string
SelectorContent string SelectorContent string
SelectorEnclosure string SelectorEnclosure string
Headers map[string]string
} }
func (t Task) CacheKey() string { func (t Task) CacheKey() string {
@ -38,6 +39,7 @@ func (t Task) CacheKey() string {
h.Write([]byte(t.SelectorCreated)) h.Write([]byte(t.SelectorCreated))
h.Write([]byte(t.SelectorContent)) h.Write([]byte(t.SelectorContent))
h.Write([]byte(t.SelectorEnclosure)) h.Write([]byte(t.SelectorEnclosure))
h.Write([]byte(fmt.Sprintf("%+v", t.Headers)))
return fmt.Sprintf("%s_%x", t.TaskType, h.Sum(nil)) return fmt.Sprintf("%s_%x", t.TaskType, h.Sum(nil))
} }