headers and cookies support

parent ab4966052b
commit 5ff8b7e6d0
@@ -126,6 +126,7 @@ func (h *Handler) handlePageScreenshot(c echo.Context) error {
 	task := models.Task{
 		TaskType: models.TaskTypePageScreenshot,
 		URL:      pageUrl,
+		Headers:  extractHeaders(c),
 	}

 	timeoutCtx, cancel := context.WithTimeout(context.Background(), taskTimeout)
@@ -201,3 +202,13 @@ func makeFeed(task models.Task, result models.TaskResult) (string, error) {
 	}
 	return atom, nil
 }
+
+func extractHeaders(c echo.Context) map[string]string {
+	headers := make(map[string]string)
+	for _, hName := range []string{"Accept-Language", "Cookie"} {
+		if len(c.Request().Header.Get(hName)) > 0 {
+			headers[hName] = c.Request().Header.Get(hName)
+		}
+	}
+	return headers
+}
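For illustration, a minimal sketch of which request headers the new extractHeaders helper forwards into the task. It is not part of the commit: it assumes echo/v4 and simply copies the helper from the hunk above into a standalone program.

// Illustration only: only Accept-Language and Cookie are copied; every other
// header on the incoming request is ignored.
package main

import (
	"fmt"
	"net/http/httptest"

	"github.com/labstack/echo/v4"
)

func extractHeaders(c echo.Context) map[string]string {
	headers := make(map[string]string)
	for _, hName := range []string{"Accept-Language", "Cookie"} {
		if len(c.Request().Header.Get(hName)) > 0 {
			headers[hName] = c.Request().Header.Get(hName)
		}
	}
	return headers
}

func main() {
	req := httptest.NewRequest("GET", "/screenshot?url=https://example.com", nil) // hypothetical route
	req.Header.Set("Accept-Language", "en-US,en;q=0.9")
	req.Header.Set("Cookie", "session=abc123; theme=dark")
	req.Header.Set("X-Forwarded-For", "203.0.113.7") // not in the allow-list, dropped

	c := echo.New().NewContext(req, httptest.NewRecorder())
	fmt.Println(extractHeaders(c))
	// map[Accept-Language:en-US,en;q=0.9 Cookie:session=abc123; theme=dark]
}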
@@ -9,6 +9,7 @@ import (
 	"github.com/labstack/gommon/log"
 	"github.com/markusmobius/go-dateparser"
 	"github.com/playwright-community/playwright-go"
+	"maps"
 )

 // Timeouts
@@ -20,6 +21,9 @@ var (
 	defOptEval = playwright.LocatorEvaluateOptions{Timeout: pwDuration(defTimeout)}
 )

+var userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36"
+var secChUa = `"Chromium";v="132", "Google Chrome";v="132", "Not-A.Brand";v="99"`
+
 type PwExtractor struct {
 	pw     *playwright.Playwright
 	chrome playwright.Browser
@@ -58,30 +62,82 @@ func (e *PwExtractor) Stop() error {
 	return nil
 }

-func (e *PwExtractor) visitPage(pageUrl string, cb func(page playwright.Page) error) (errRet error) {
-	page, err := e.chrome.NewPage()
+func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page) error) (errRet error) {
+	headers := maps.Clone(task.Headers)
+	headers["Sec-Ch-Ua"] = secChUa
+	var cookieStr string
+	if v, ok := headers["Cookie"]; ok {
+		cookieStr = v
+		delete(headers, "Cookie")
+	}
+
+	bCtx, err := e.chrome.NewContext(playwright.BrowserNewContextOptions{
+		ExtraHttpHeaders: headers,
+		UserAgent:        &userAgent,
+	})
+	if err != nil {
+		return fmt.Errorf("create browser context: %w", err)
+	}
+	defer func() {
+		if err := bCtx.Close(); err != nil {
+			errRet = fmt.Errorf("close context: %w; other error=%w", err, errRet)
+		}
+	}()
+
+	if len(cookieStr) > 0 {
+		cookies, err := parseCookieString(cookieStr)
+		if err != nil {
+			return fmt.Errorf("parsing cookies: %w", err)
+		}
+
+		baseDomain, err := parseBaseDomain(task.URL)
+		if err != nil {
+			return fmt.Errorf("parse base domain: %w", err)
+		}
+
+		var pwCookies []playwright.OptionalCookie
+		for k, v := range cookies {
+			pwCookies = append(pwCookies, playwright.OptionalCookie{
+				Name:   k,
+				Value:  v,
+				Domain: playwright.String(fmt.Sprintf(".%s", baseDomain)),
+				Path:   playwright.String("/"),
+			})
+		}
+
+		if err := bCtx.AddCookies(pwCookies); err != nil {
+			return fmt.Errorf("add cookies: %w", err)
+		}
+	}
+
+	page, err := bCtx.NewPage()
 	if err != nil {
 		return fmt.Errorf("browser new page: %w", err)
 	}
 	defer func() {
-		err := page.Close()
-		if err != nil {
+		if err := page.Close(); err != nil {
 			errRet = fmt.Errorf("close page: %w; other error=%w", err, errRet)
 		}
 	}()
 	log.Debugf("Page opened")

-	if _, err := page.Goto(pageUrl); err != nil {
+	if len(task.Headers) > 0 {
+		if err := page.SetExtraHTTPHeaders(task.Headers); err != nil {
+			return fmt.Errorf("set headers: %w", err)
+		}
+	}
+
+	if _, err := page.Goto(task.URL, playwright.PageGotoOptions{Timeout: pwDuration("10s")}); err != nil {
 		return fmt.Errorf("goto page: %w", err)
 	}
-	log.Debugf("Url %s visited", pageUrl)
-	defer log.Debugf("Visiting page %s finished", pageUrl)
+	log.Debugf("Url %s visited", task.URL)
+	defer log.Debugf("Visiting page %s finished", task.URL)

 	return cb(page)
 }

 func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errRet error) {
-	errRet = e.visitPage(task.URL, func(page playwright.Page) error {
+	errRet = e.visitPage(task, func(page playwright.Page) error {
 		parser := pageParser{
 			task: task,
 			page: page,
@@ -100,7 +156,7 @@ func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errRet error) {
 }

 func (e *PwExtractor) Screenshot(task models.Task) (result *models.ScreenshotTaskResult, errRet error) {
-	errRet = e.visitPage(task.URL, func(page playwright.Page) error {
+	errRet = e.visitPage(task, func(page playwright.Page) error {
 		err := page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
 			State:   playwright.LoadStateNetworkidle,
 			Timeout: pwDuration("5s"),
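To make the new visitPage flow concrete, here is a small self-contained sketch of its header-splitting step. It is illustration only, not part of the commit: it mirrors the first lines of visitPage above, where the Cookie entry is pulled out of the cloned header map so it can be installed through the browser context's AddCookies rather than sent as an extra HTTP header.

// Illustration only: the values below are made up for the example.
package main

import (
	"fmt"
	"maps"
)

func main() {
	taskHeaders := map[string]string{
		"Accept-Language": "en-US,en;q=0.9",
		"Cookie":          "session=abc123",
	}

	headers := maps.Clone(taskHeaders) // clone so the task's own map is left untouched
	headers["Sec-Ch-Ua"] = `"Chromium";v="132", "Google Chrome";v="132", "Not-A.Brand";v="99"`

	var cookieStr string
	if v, ok := headers["Cookie"]; ok {
		cookieStr = v
		delete(headers, "Cookie")
	}

	fmt.Println(headers)   // goes into BrowserNewContextOptions.ExtraHttpHeaders
	fmt.Println(cookieStr) // parsed by parseCookieString and added via AddCookies
}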
@@ -4,6 +4,7 @@ import (
 	"fmt"
 	"github.com/playwright-community/playwright-go"
 	"net/url"
+	"slices"
 	"strings"
 	"time"
 )
@@ -51,3 +52,33 @@ func parseProxy(s string) (*playwright.Proxy, error) {
 	}
 	return proxy, nil
 }
+
+func parseBaseDomain(urlStr string) (string, error) {
+	pageUrl, err := url.Parse(urlStr)
+	if err != nil {
+		return "", fmt.Errorf("task url parsing: %w", err)
+	}
+	domainParts := strings.Split(pageUrl.Host, ".")
+	slices.Reverse(domainParts) // com, example, www
+	return fmt.Sprintf("%s.%s", domainParts[1], domainParts[0]), nil
+}
+
+func parseCookieString(cookieStr string) (map[string]string, error) {
+	result := make(map[string]string)
+	failed := fmt.Errorf("failed to parse cookies")
+
+	for _, cook := range strings.Split(cookieStr, ";") {
+		kv := strings.Split(cook, "=")
+		if len(kv) != 2 {
+			return nil, failed
+		}
+		k, err1 := url.QueryUnescape(kv[0])
+		v, err2 := url.QueryUnescape(kv[1])
+		if err1 != nil || err2 != nil {
+			return nil, failed
+		}
+		result[k] = v
+	}
+
+	return result, nil
+}
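A test-style sketch of how these two helpers behave. It is illustration only, not part of the commit, and it assumes it sits in the same package as the helpers above (the package name is not visible in this diff, so "extractor" here is a placeholder).

// Illustration only: exercises parseBaseDomain and parseCookieString as committed above.
package extractor // assumed package name

import "testing"

func TestCookieHelpers(t *testing.T) {
	base, err := parseBaseDomain("https://www.example.com/news/latest")
	if err != nil || base != "example.com" {
		t.Fatalf("base=%q err=%v", base, err)
	}

	cookies, err := parseCookieString("session=abc123;theme=dark")
	if err != nil || cookies["session"] != "abc123" || cookies["theme"] != "dark" {
		t.Fatalf("cookies=%v err=%v", cookies, err)
	}

	// Note: the cookie string is split on ";" without trimming, so a conventional
	// "name=a; other=b" header yields a key with a leading space (" other").
	if _, err := parseCookieString("malformed"); err == nil {
		t.Fatal("expected an error for a cookie pair without '='")
	}
}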
@@ -25,6 +25,7 @@ type Task struct {
 	SelectorCreated   string
 	SelectorContent   string
 	SelectorEnclosure string
+	Headers           map[string]string
 }

 func (t Task) CacheKey() string {
@@ -38,6 +39,7 @@ func (t Task) CacheKey() string {
 	h.Write([]byte(t.SelectorCreated))
 	h.Write([]byte(t.SelectorContent))
 	h.Write([]byte(t.SelectorEnclosure))
+	h.Write([]byte(fmt.Sprintf("%+v", t.Headers)))
 	return fmt.Sprintf("%s_%x", t.TaskType, h.Sum(nil))
 }
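One detail worth noting about the CacheKey change: the headers are folded in via fmt.Sprintf("%+v", t.Headers), which is deterministic because fmt prints map keys in sorted order. A tiny standalone check (illustration only, not part of the commit):

// Illustration only: shows that %+v of a map does not depend on insertion order,
// so tasks that differ only in header values hash to different cache keys.
package main

import "fmt"

func main() {
	a := map[string]string{"Cookie": "session=abc", "Accept-Language": "en-US"}
	b := map[string]string{"Accept-Language": "en-US", "Cookie": "session=abc"}

	fmt.Println(fmt.Sprintf("%+v", a) == fmt.Sprintf("%+v", b)) // true
	fmt.Println(fmt.Sprintf("%+v", a))                          // map[Accept-Language:en-US Cookie:session=abc]
}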