headers and cookies support
This commit is contained in:
parent
ab4966052b
commit
5ff8b7e6d0
@ -126,6 +126,7 @@ func (h *Handler) handlePageScreenshot(c echo.Context) error {
|
|||||||
task := models.Task{
|
task := models.Task{
|
||||||
TaskType: models.TaskTypePageScreenshot,
|
TaskType: models.TaskTypePageScreenshot,
|
||||||
URL: pageUrl,
|
URL: pageUrl,
|
||||||
|
Headers: extractHeaders(c),
|
||||||
}
|
}
|
||||||
|
|
||||||
timeoutCtx, cancel := context.WithTimeout(context.Background(), taskTimeout)
|
timeoutCtx, cancel := context.WithTimeout(context.Background(), taskTimeout)
|
||||||
@ -201,3 +202,13 @@ func makeFeed(task models.Task, result models.TaskResult) (string, error) {
|
|||||||
}
|
}
|
||||||
return atom, nil
|
return atom, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func extractHeaders(c echo.Context) map[string]string {
|
||||||
|
headers := make(map[string]string)
|
||||||
|
for _, hName := range []string{"Accept-Language", "Cookie"} {
|
||||||
|
if len(c.Request().Header.Get(hName)) > 0 {
|
||||||
|
headers[hName] = c.Request().Header.Get(hName)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return headers
|
||||||
|
}
|
||||||
|
|||||||
@ -9,6 +9,7 @@ import (
|
|||||||
"github.com/labstack/gommon/log"
|
"github.com/labstack/gommon/log"
|
||||||
"github.com/markusmobius/go-dateparser"
|
"github.com/markusmobius/go-dateparser"
|
||||||
"github.com/playwright-community/playwright-go"
|
"github.com/playwright-community/playwright-go"
|
||||||
|
"maps"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Timeouts
|
// Timeouts
|
||||||
@ -20,6 +21,9 @@ var (
|
|||||||
defOptEval = playwright.LocatorEvaluateOptions{Timeout: pwDuration(defTimeout)}
|
defOptEval = playwright.LocatorEvaluateOptions{Timeout: pwDuration(defTimeout)}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Browser identity presented to visited pages. Both values advertise
// Chrome 132 and should be updated together so they stay consistent.
var (
	// userAgent is passed as the User-Agent of every browser context.
	// It stays a variable (not a constant) because its address is taken.
	userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36"

	// secChUa is sent as the Sec-Ch-Ua request header.
	secChUa = `"Chromium";v="132", "Google Chrome";v="132", "Not-A.Brand";v="99"`
)
|
||||||
|
|
||||||
type PwExtractor struct {
|
type PwExtractor struct {
|
||||||
pw *playwright.Playwright
|
pw *playwright.Playwright
|
||||||
chrome playwright.Browser
|
chrome playwright.Browser
|
||||||
@ -58,30 +62,82 @@ func (e *PwExtractor) Stop() error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (e *PwExtractor) visitPage(pageUrl string, cb func(page playwright.Page) error) (errRet error) {
|
func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page) error) (errRet error) {
|
||||||
page, err := e.chrome.NewPage()
|
headers := maps.Clone(task.Headers)
|
||||||
|
headers["Sec-Ch-Ua"] = secChUa
|
||||||
|
var cookieStr string
|
||||||
|
if v, ok := headers["Cookie"]; ok {
|
||||||
|
cookieStr = v
|
||||||
|
delete(headers, "Cookie")
|
||||||
|
}
|
||||||
|
|
||||||
|
bCtx, err := e.chrome.NewContext(playwright.BrowserNewContextOptions{
|
||||||
|
ExtraHttpHeaders: headers,
|
||||||
|
UserAgent: &userAgent,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("create browser context: %w", err)
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if err := bCtx.Close(); err != nil {
|
||||||
|
errRet = fmt.Errorf("close context: %w; other error=%w", err, errRet)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
if len(cookieStr) > 0 {
|
||||||
|
cookies, err := parseCookieString(cookieStr)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("parsing cookies: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
baseDomain, err := parseBaseDomain(task.URL)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("parse base domain: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
var pwCookies []playwright.OptionalCookie
|
||||||
|
for k, v := range cookies {
|
||||||
|
pwCookies = append(pwCookies, playwright.OptionalCookie{
|
||||||
|
Name: k,
|
||||||
|
Value: v,
|
||||||
|
Domain: playwright.String(fmt.Sprintf(".%s", baseDomain)),
|
||||||
|
Path: playwright.String("/"),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := bCtx.AddCookies(pwCookies); err != nil {
|
||||||
|
return fmt.Errorf("add cookies: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
page, err := bCtx.NewPage()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("browser new page: %w", err)
|
return fmt.Errorf("browser new page: %w", err)
|
||||||
}
|
}
|
||||||
defer func() {
|
defer func() {
|
||||||
err := page.Close()
|
if err := page.Close(); err != nil {
|
||||||
if err != nil {
|
|
||||||
errRet = fmt.Errorf("close page: %w; other error=%w", err, errRet)
|
errRet = fmt.Errorf("close page: %w; other error=%w", err, errRet)
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
log.Debugf("Page opened")
|
log.Debugf("Page opened")
|
||||||
|
|
||||||
if _, err := page.Goto(pageUrl); err != nil {
|
if len(task.Headers) > 0 {
|
||||||
|
if err := page.SetExtraHTTPHeaders(task.Headers); err != nil {
|
||||||
|
return fmt.Errorf("set headers: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if _, err := page.Goto(task.URL, playwright.PageGotoOptions{Timeout: pwDuration("10s")}); err != nil {
|
||||||
return fmt.Errorf("goto page: %w", err)
|
return fmt.Errorf("goto page: %w", err)
|
||||||
}
|
}
|
||||||
log.Debugf("Url %s visited", pageUrl)
|
log.Debugf("Url %s visited", task.URL)
|
||||||
defer log.Debugf("Visiting page %s finished", pageUrl)
|
defer log.Debugf("Visiting page %s finished", task.URL)
|
||||||
|
|
||||||
return cb(page)
|
return cb(page)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errRet error) {
|
func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errRet error) {
|
||||||
errRet = e.visitPage(task.URL, func(page playwright.Page) error {
|
errRet = e.visitPage(task, func(page playwright.Page) error {
|
||||||
parser := pageParser{
|
parser := pageParser{
|
||||||
task: task,
|
task: task,
|
||||||
page: page,
|
page: page,
|
||||||
@ -100,7 +156,7 @@ func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errR
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (e *PwExtractor) Screenshot(task models.Task) (result *models.ScreenshotTaskResult, errRet error) {
|
func (e *PwExtractor) Screenshot(task models.Task) (result *models.ScreenshotTaskResult, errRet error) {
|
||||||
errRet = e.visitPage(task.URL, func(page playwright.Page) error {
|
errRet = e.visitPage(task, func(page playwright.Page) error {
|
||||||
err := page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
|
err := page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
|
||||||
State: playwright.LoadStateNetworkidle,
|
State: playwright.LoadStateNetworkidle,
|
||||||
Timeout: pwDuration("5s"),
|
Timeout: pwDuration("5s"),
|
||||||
|
|||||||
@ -4,6 +4,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"github.com/playwright-community/playwright-go"
|
"github.com/playwright-community/playwright-go"
|
||||||
"net/url"
|
"net/url"
|
||||||
|
"slices"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
@ -51,3 +52,33 @@ func parseProxy(s string) (*playwright.Proxy, error) {
|
|||||||
}
|
}
|
||||||
return proxy, nil
|
return proxy, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// parseBaseDomain returns the last two DNS labels of the host in urlStr,
// e.g. "example.com" for "https://www.example.com:8443/feed". The port is
// never part of the result.
//
// Hosts that cannot be shortened are returned unchanged: single-label names
// such as "localhost", two-label names, and IP literals.
//
// It returns an error when urlStr cannot be parsed or contains no host.
//
// NOTE(review): multi-label public suffixes (e.g. "co.uk") are not
// recognised; "www.example.co.uk" yields "co.uk".
func parseBaseDomain(urlStr string) (string, error) {
	pageUrl, err := url.Parse(urlStr)
	if err != nil {
		return "", fmt.Errorf("task url parsing: %w", err)
	}

	// Hostname strips the port and IPv6 brackets; a trailing root dot is dropped.
	host := strings.TrimSuffix(pageUrl.Hostname(), ".")
	if host == "" {
		return "", fmt.Errorf("task url parsing: no host in %q", urlStr)
	}

	labels := strings.Split(host, ".")
	last := labels[len(labels)-1]

	// An all-digit final label means an IPv4 literal, which must stay whole.
	isIPv4 := strings.Trim(last, "0123456789") == ""
	if len(labels) < 3 || isIPv4 {
		return host, nil
	}
	return labels[len(labels)-2] + "." + last, nil
}
|
||||||
|
|
||||||
|
// parseCookieString parses the value of a Cookie request header
// ("name=value; name2=value2") into a name-to-value map.
//
// Whitespace around each pair, name and value is ignored, empty segments
// (such as the one after a trailing ";") are skipped, and only the first "="
// separates name from value, so values may themselves contain "=".
// Names and values are URL-unescaped with url.QueryUnescape, which also
// turns "+" into a space.
//
// It returns an error when a pair has no "=", has an empty name, or contains
// an invalid percent-escape. When the same name occurs more than once the
// last occurrence wins.
func parseCookieString(cookieStr string) (map[string]string, error) {
	result := make(map[string]string)
	// The error deliberately carries no cookie data so that secrets do not
	// end up in logs.
	failed := fmt.Errorf("failed to parse cookies")

	for _, pair := range strings.Split(cookieStr, ";") {
		pair = strings.TrimSpace(pair)
		if pair == "" {
			continue
		}

		rawName, rawValue, found := strings.Cut(pair, "=")
		if !found {
			return nil, failed
		}

		k, err1 := url.QueryUnescape(strings.TrimSpace(rawName))
		v, err2 := url.QueryUnescape(strings.TrimSpace(rawValue))
		if err1 != nil || err2 != nil || k == "" {
			return nil, failed
		}
		result[k] = v
	}

	return result, nil
}
|
||||||
|
|||||||
@ -25,6 +25,7 @@ type Task struct {
|
|||||||
SelectorCreated string
|
SelectorCreated string
|
||||||
SelectorContent string
|
SelectorContent string
|
||||||
SelectorEnclosure string
|
SelectorEnclosure string
|
||||||
|
Headers map[string]string
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t Task) CacheKey() string {
|
func (t Task) CacheKey() string {
|
||||||
@ -38,6 +39,7 @@ func (t Task) CacheKey() string {
|
|||||||
h.Write([]byte(t.SelectorCreated))
|
h.Write([]byte(t.SelectorCreated))
|
||||||
h.Write([]byte(t.SelectorContent))
|
h.Write([]byte(t.SelectorContent))
|
||||||
h.Write([]byte(t.SelectorEnclosure))
|
h.Write([]byte(t.SelectorEnclosure))
|
||||||
|
h.Write([]byte(fmt.Sprintf("%+v", t.Headers)))
|
||||||
return fmt.Sprintf("%s_%x", t.TaskType, h.Sum(nil))
|
return fmt.Sprintf("%s_%x", t.TaskType, h.Sum(nil))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user