screenshots; some refactoring

This commit is contained in:
Egor Aristov 2025-01-22 17:15:38 +03:00
parent ca2087eb7b
commit bd33314298
Signed by: egor3f
GPG Key ID: 40482A264AAEC85F
7 changed files with 132 additions and 31 deletions

View File

@ -63,10 +63,15 @@ func main() {
errRet = fmt.Errorf("unmarshal task: %w", err) errRet = fmt.Errorf("unmarshal task: %w", err)
return return
} }
cacheKey = task.CacheKey() var result any
result, err := pwe.Extract(task) switch task.TaskType {
case models.TaskTypeExtract:
result, err = pwe.Extract(task)
case models.TaskTypePageScreenshot:
result, err = pwe.Screenshot(task)
}
if err != nil { if err != nil {
errRet = fmt.Errorf("extract: %w", err) errRet = fmt.Errorf("task processing: %w", err)
return return
} }
resultPayoad, err = json.Marshal(result) resultPayoad, err = json.Marshal(result)
@ -74,7 +79,7 @@ func main() {
errRet = fmt.Errorf("marshal result: %w", err) errRet = fmt.Errorf("marshal result: %w", err)
return return
} }
return return task.CacheKey(), resultPayoad, errRet
}) })
if err != nil { if err != nil {
log.Panicf("consume queue: %v", err) log.Panicf("consume queue: %v", err)

View File

@ -13,7 +13,10 @@
<form name="wizard"> <form name="wizard">
<div class="field"> <div class="field">
<div class="label"><label for="w_url">URL of page for converting</label></div> <div class="label"><label for="w_url">URL of page for converting</label></div>
<div class="input"><input type="url" name="url" id="w_url"/></div> <div class="input">
<input type="url" name="url" id="w_url"/>
<a id="page_screenshot_link" target="_blank">Render screenshot</a>
</div>
</div> </div>
<div class="field"> <div class="field">
<div class="label"><label for="w_selector_post">CSS Selector for post</label></div> <div class="label"><label for="w_selector_post">CSS Selector for post</label></div>

View File

@ -1,4 +1,4 @@
#ready_url_link { #ready_url_link, #page_screenshot_link {
visibility: hidden; visibility: hidden;
} }

View File

@ -47,24 +47,35 @@ function displayUrl(url) {
} }
function baseUrl() { function baseUrl() {
return document.location.origin + '/api/v1/render/'; return document.location.origin + '/api/v1';
} }
async function genUrl() { async function genUrl() {
let specs = readSpecsForm(); let specs = readSpecsForm();
let encodedSpecs = await encodeSpecs(specs); let encodedSpecs = await encodeSpecs(specs);
let url = baseUrl() + encodedSpecs; let url = baseUrl() + '/render/' + encodedSpecs;
displayUrl(url); displayUrl(url);
} }
async function editUrl() { async function editUrl() {
let url = document.getElementById('url_input').value; let url = document.getElementById('url_input').value;
let specs = await decodeSpecs(url.replace(baseUrl(), '')); let specs = await decodeSpecs(url.replace(baseUrl() + '/render/', ''));
writeSpecsToForm(specs); writeSpecsToForm(specs);
displayUrl(url); displayUrl(url);
} }
/**
 * Input handler for the wizard's URL field.
 *
 * Shows the "Render screenshot" link and points it at the screenshot endpoint
 * while the field contains non-whitespace text; hides the link otherwise.
 */
function onUrlInput() {
    const pageUrl = document.forms['wizard'].elements['url'].value.trim();
    const link = document.getElementById('page_screenshot_link');
    if (pageUrl.length > 0) {
        // The page URL travels as a query parameter, so it has to be
        // percent-encoded: its own '&', '#', '+' or '?' characters would
        // otherwise be parsed as part of the screenshot request itself.
        link.href = `${baseUrl()}/screenshot?url=${encodeURIComponent(pageUrl)}`;
        link.style.visibility = 'visible';
    } else {
        link.style.visibility = 'hidden';
    }
}
document.addEventListener('DOMContentLoaded', ev => { document.addEventListener('DOMContentLoaded', ev => {
document.getElementById('btn_gen_url').addEventListener('click', genUrl); document.getElementById('btn_gen_url').addEventListener('click', genUrl);
document.getElementById('btn_edit').addEventListener('click', editUrl); document.getElementById('btn_edit').addEventListener('click', editUrl);
document.getElementById('w_url').addEventListener('input', onUrlInput);
}); });

View File

@ -16,9 +16,16 @@ import (
"github.com/labstack/gommon/log" "github.com/labstack/gommon/log"
"html" "html"
"io" "io"
"net/url"
"time" "time"
) )
// Timing limits shared by the HTTP handlers in this file.
const (
// taskTimeout is the deadline applied to the context that is passed to the
// cached work queue when a task is submitted.
taskTimeout = 20 * time.Second
// minLifetime is the shortest cache lifetime used by the handlers. It is
// defined as taskTimeout, and it is the lifetime the screenshot handler
// passes to the cached queue.
minLifetime = taskTimeout
// maxLifetime is presumably the upper bound for a client-requested cache
// lifetime in handleRender; the code that applies it is not visible here
// - TODO confirm.
maxLifetime = 24 * time.Hour
)
type Handler struct { type Handler struct {
validate *validator.Validate validate *validator.Validate
CachedQueue adapters.CachedWorkQueue CachedQueue adapters.CachedWorkQueue
@ -35,6 +42,7 @@ func New(cq adapters.CachedWorkQueue) *Handler {
func (h *Handler) SetupRoutes(g *echo.Group) { func (h *Handler) SetupRoutes(g *echo.Group) {
g.GET("/render/:specs", h.handleRender) g.GET("/render/:specs", h.handleRender)
g.GET("/screenshot", h.handlePageScreenshot)
} }
type Specs struct { type Specs struct {
@ -58,6 +66,7 @@ func (h *Handler) handleRender(c echo.Context) error {
} }
task := models.Task{ task := models.Task{
TaskType: models.TaskTypeExtract,
URL: specs.URL, URL: specs.URL,
SelectorPost: specs.SelectorPost, SelectorPost: specs.SelectorPost,
SelectorTitle: specs.SelectorTitle, SelectorTitle: specs.SelectorTitle,
@ -69,9 +78,6 @@ func (h *Handler) handleRender(c echo.Context) error {
SelectorEnclosure: specs.SelectorEnclosure, SelectorEnclosure: specs.SelectorEnclosure,
} }
taskTimeout := 20 * time.Second
minLifetime := taskTimeout
maxLifetime := 24 * time.Hour
cacheLifetime, err := time.ParseDuration(specs.CacheLifetime) cacheLifetime, err := time.ParseDuration(specs.CacheLifetime)
if err != nil { if err != nil {
return echo.NewHTTPError(400, "invalid cache lifetime") return echo.NewHTTPError(400, "invalid cache lifetime")
@ -98,7 +104,6 @@ func (h *Handler) handleRender(c echo.Context) error {
var result models.TaskResult var result models.TaskResult
if err := json.Unmarshal(taskResultBytes, &result); err != nil { if err := json.Unmarshal(taskResultBytes, &result); err != nil {
log.Errorf("cached value unmarshal failed: %v", err)
return echo.NewHTTPError(500, fmt.Errorf("cached value unmarshal failed: %v", err)) return echo.NewHTTPError(500, fmt.Errorf("cached value unmarshal failed: %v", err))
} }
@ -112,6 +117,38 @@ func (h *Handler) handleRender(c echo.Context) error {
return c.String(200, atom) return c.String(200, atom)
} }
// handlePageScreenshot serves a PNG screenshot of the page named by the "url"
// query parameter.
//
// The parameter must be an absolute http or https URL with a host; anything
// else is rejected with status 400. The request is turned into a
// TaskTypePageScreenshot task and submitted through the cached work queue with
// a taskTimeout deadline and minLifetime as the cache lifetime. Marshal, queue
// and unmarshal failures are reported as status 500.
func (h *Handler) handlePageScreenshot(c echo.Context) error {
	pageURL := c.QueryParam("url")
	// url.Parse succeeds on the empty string and on relative references, so a
	// nil error alone does not show that a usable page address was supplied.
	// Restricting the scheme also keeps non-web schemes (e.g. file:) away
	// from the browser that renders the page.
	parsed, err := url.Parse(pageURL)
	if err != nil || parsed.Host == "" || (parsed.Scheme != "http" && parsed.Scheme != "https") {
		return echo.NewHTTPError(400, "url is invalid or missing")
	}

	task := models.Task{
		TaskType: models.TaskTypePageScreenshot,
		URL:      pageURL,
	}
	encodedTask, err := json.Marshal(task)
	if err != nil {
		return echo.NewHTTPError(500, fmt.Errorf("task marshal error: %v", err))
	}

	timeoutCtx, cancel := context.WithTimeout(context.Background(), taskTimeout)
	defer cancel()

	taskResultBytes, err := h.CachedQueue.ProcessWorkCached(timeoutCtx, minLifetime, task.CacheKey(), encodedTask)
	if err != nil {
		return echo.NewHTTPError(500, fmt.Errorf("queued cache failed: %v", err))
	}

	var result models.ScreenshotTaskResult
	if err := json.Unmarshal(taskResultBytes, &result); err != nil {
		return echo.NewHTTPError(500, fmt.Errorf("task result unmarshal failed: %v", err))
	}
	return c.Blob(200, "image/png", result.Image)
}
func (h *Handler) decodeSpecs(specsParam string) (Specs, error) { func (h *Handler) decodeSpecs(specsParam string) (Specs, error) {
decodedSpecsParam, err := base64.StdEncoding.WithPadding(base64.NoPadding).DecodeString(specsParam) decodedSpecsParam, err := base64.StdEncoding.WithPadding(base64.NoPadding).DecodeString(specsParam)
if err != nil { if err != nil {

View File

@ -58,10 +58,10 @@ func (e *PwExtractor) Stop() error {
return nil return nil
} }
func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errRet error) { func (e *PwExtractor) visitPage(pageUrl string, cb func(page playwright.Page) error) (errRet error) {
page, err := e.chrome.NewPage() page, err := e.chrome.NewPage()
if err != nil { if err != nil {
return nil, fmt.Errorf("browser new page: %w", err) return fmt.Errorf("browser new page: %w", err)
} }
defer func() { defer func() {
err := page.Close() err := page.Close()
@ -71,25 +71,58 @@ func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errR
}() }()
log.Debugf("Page opened") log.Debugf("Page opened")
if _, err := page.Goto(task.URL); err != nil { if _, err := page.Goto(pageUrl); err != nil {
return nil, fmt.Errorf("goto page: %w", err) return fmt.Errorf("goto page: %w", err)
} }
log.Debugf("Url %s visited", task.URL) log.Debugf("Url %s visited", pageUrl)
defer log.Debugf("Visiting page %s finished", pageUrl)
return cb(page)
}
func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errRet error) {
errRet = e.visitPage(task.URL, func(page playwright.Page) error {
parser := pageParser{ parser := pageParser{
task: task, task: task,
page: page, page: page,
} }
var err error
result, err = parser.parse() result, err = parser.parse()
if err != nil { if err != nil {
return nil, fmt.Errorf("parse page: %w", err) return fmt.Errorf("parse page: %w", err)
} }
if len(result.Items) == 0 { if len(result.Items) == 0 {
return nil, fmt.Errorf("extract failed for all posts") return fmt.Errorf("extract failed for all posts")
} }
return nil
})
return
}
return result, nil func (e *PwExtractor) Screenshot(task models.Task) (result *models.ScreenshotTaskResult, errRet error) {
errRet = e.visitPage(task.URL, func(page playwright.Page) error {
err := page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
State: playwright.LoadStateNetworkidle,
Timeout: pwDuration("5s"),
})
if err != nil {
log.Debugf("Wait for network idle: %w", err)
}
if err := page.SetViewportSize(1280, 800); err != nil {
return fmt.Errorf("set viewport size: %w", err)
}
screenshot, err := page.Screenshot(playwright.PageScreenshotOptions{
Animations: playwright.ScreenshotAnimationsDisabled,
Timeout: pwDuration("5s"),
})
if err != nil {
return fmt.Errorf("make screenshot: %w", err)
}
log.Infof("Screenshot finished; total size: %d bytes", len(screenshot))
result = &models.ScreenshotTaskResult{Image: screenshot}
return nil
})
return
} }
type pageParser struct { type pageParser struct {

View File

@ -6,8 +6,16 @@ import (
"time" "time"
) )
// TaskType identifies which kind of work a Task asks a worker to perform.
// Its string value is also used as the prefix of the task's cache key.
type TaskType string

// Supported task types. The constants are declared as TaskType (not as
// untyped string constants) so that they cannot be silently mixed up with
// arbitrary strings.
const (
	// TaskTypeExtract requests extraction of feed items from a page.
	TaskTypeExtract TaskType = "extract"
	// TaskTypePageScreenshot requests a screenshot of a page.
	TaskTypePageScreenshot TaskType = "page_screenshot"
)
type Task struct { type Task struct {
// While adding new fields, dont forget to alter caching func // While adding new fields, dont forget to alter caching func
TaskType TaskType
URL string URL string
SelectorPost string SelectorPost string
SelectorTitle string SelectorTitle string
@ -30,7 +38,7 @@ func (t Task) CacheKey() string {
h.Write([]byte(t.SelectorCreated)) h.Write([]byte(t.SelectorCreated))
h.Write([]byte(t.SelectorContent)) h.Write([]byte(t.SelectorContent))
h.Write([]byte(t.SelectorEnclosure)) h.Write([]byte(t.SelectorEnclosure))
return fmt.Sprintf("%x", h.Sum(nil)) return fmt.Sprintf("%s_%x", t.TaskType, h.Sum(nil))
} }
type FeedItem struct { type FeedItem struct {
@ -50,3 +58,7 @@ type TaskResult struct {
Items []FeedItem Items []FeedItem
Icon string Icon string
} }
// ScreenshotTaskResult is the result payload of a page screenshot task.
// The HTTP handler serves Image to the client with the image/png content type.
type ScreenshotTaskResult struct {
Image []byte // PNG-encoded screenshot bytes
}