From ab4966052b33613c38a8fbf8349878061d2f115e Mon Sep 17 00:00:00 2001 From: Egor Aristov Date: Wed, 22 Jan 2025 17:15:38 +0300 Subject: [PATCH] screenshots; some refactoring --- cmd/worker/worker.go | 13 ++-- frontend/wizard/index.html | 5 +- frontend/wizard/main.css | 2 +- frontend/wizard/main.js | 17 ++++- internal/delivery/http/handler.go | 45 +++++++++++-- .../extractors/pwextractor/pwextractor.go | 67 ++++++++++++++----- internal/models/models.go | 14 +++- 7 files changed, 132 insertions(+), 31 deletions(-) diff --git a/cmd/worker/worker.go b/cmd/worker/worker.go index 69f2c40..ffbcb5d 100644 --- a/cmd/worker/worker.go +++ b/cmd/worker/worker.go @@ -63,10 +63,15 @@ func main() { errRet = fmt.Errorf("unmarshal task: %w", err) return } - cacheKey = task.CacheKey() - result, err := pwe.Extract(task) + var result any + switch task.TaskType { + case models.TaskTypeExtract: + result, err = pwe.Extract(task) + case models.TaskTypePageScreenshot: + result, err = pwe.Screenshot(task) + } if err != nil { - errRet = fmt.Errorf("extract: %w", err) + errRet = fmt.Errorf("task processing: %w", err) return } resultPayoad, err = json.Marshal(result) @@ -74,7 +79,7 @@ func main() { errRet = fmt.Errorf("marshal result: %w", err) return } - return + return task.CacheKey(), resultPayoad, errRet }) if err != nil { log.Panicf("consume queue: %v", err) diff --git a/frontend/wizard/index.html b/frontend/wizard/index.html index cd3d80c..680f366 100644 --- a/frontend/wizard/index.html +++ b/frontend/wizard/index.html @@ -13,7 +13,10 @@
-
+
diff --git a/frontend/wizard/main.css b/frontend/wizard/main.css index 7b19695..0a35b0f 100644 --- a/frontend/wizard/main.css +++ b/frontend/wizard/main.css @@ -1,4 +1,4 @@ -#ready_url_link { +#ready_url_link, #page_screenshot_link { visibility: hidden; } diff --git a/frontend/wizard/main.js b/frontend/wizard/main.js index f4a0910..81e7787 100644 --- a/frontend/wizard/main.js +++ b/frontend/wizard/main.js @@ -47,24 +47,35 @@ function displayUrl(url) { } function baseUrl() { - return document.location.origin + '/api/v1/render/'; + return document.location.origin + '/api/v1'; } async function genUrl() { let specs = readSpecsForm(); let encodedSpecs = await encodeSpecs(specs); - let url = baseUrl() + encodedSpecs; + let url = baseUrl() + '/render/' + encodedSpecs; displayUrl(url); } async function editUrl() { let url = document.getElementById('url_input').value; - let specs = await decodeSpecs(url.replace(baseUrl(), '')); + let specs = await decodeSpecs(url.replace(baseUrl() + '/render/', '')); writeSpecsToForm(specs); displayUrl(url); } +function onUrlInput() { + let url = document.forms['wizard'].elements['url'].value; + if (url.trim().length > 0) { + document.getElementById('page_screenshot_link').style.visibility = 'visible'; + document.getElementById('page_screenshot_link').href = `${baseUrl()}/screenshot?url=${url}`; + } else { + document.getElementById('page_screenshot_link').style.visibility = 'hidden'; + } +} + document.addEventListener('DOMContentLoaded', ev => { document.getElementById('btn_gen_url').addEventListener('click', genUrl); document.getElementById('btn_edit').addEventListener('click', editUrl); + document.getElementById('w_url').addEventListener('input', onUrlInput); }); diff --git a/internal/delivery/http/handler.go b/internal/delivery/http/handler.go index d16b33c..9a33820 100644 --- a/internal/delivery/http/handler.go +++ b/internal/delivery/http/handler.go @@ -16,9 +16,16 @@ import ( "github.com/labstack/gommon/log" "html" "io" + "net/url" "time" ) +const ( + taskTimeout = 20 * time.Second + minLifetime = taskTimeout + maxLifetime = 24 * time.Hour +) + type Handler struct { validate *validator.Validate CachedQueue adapters.CachedWorkQueue @@ -35,6 +42,7 @@ func New(cq adapters.CachedWorkQueue) *Handler { func (h *Handler) SetupRoutes(g *echo.Group) { g.GET("/render/:specs", h.handleRender) + g.GET("/screenshot", h.handlePageScreenshot) } type Specs struct { @@ -58,6 +66,7 @@ func (h *Handler) handleRender(c echo.Context) error { } task := models.Task{ + TaskType: models.TaskTypeExtract, URL: specs.URL, SelectorPost: specs.SelectorPost, SelectorTitle: specs.SelectorTitle, @@ -69,9 +78,6 @@ func (h *Handler) handleRender(c echo.Context) error { SelectorEnclosure: specs.SelectorEnclosure, } - taskTimeout := 20 * time.Second - minLifetime := taskTimeout - maxLifetime := 24 * time.Hour cacheLifetime, err := time.ParseDuration(specs.CacheLifetime) if err != nil { return echo.NewHTTPError(400, "invalid cache lifetime") @@ -98,7 +104,6 @@ func (h *Handler) handleRender(c echo.Context) error { var result models.TaskResult if err := json.Unmarshal(taskResultBytes, &result); err != nil { - log.Errorf("cached value unmarshal failed: %v", err) return echo.NewHTTPError(500, fmt.Errorf("cached value unmarshal failed: %v", err)) } @@ -112,6 +117,38 @@ func (h *Handler) handleRender(c echo.Context) error { return c.String(200, atom) } +func (h *Handler) handlePageScreenshot(c echo.Context) error { + pageUrl := c.QueryParam("url") + if _, err := url.Parse(pageUrl); err != nil { + return echo.NewHTTPError(400, "url is invalid or missing") + } + + task := models.Task{ + TaskType: models.TaskTypePageScreenshot, + URL: pageUrl, + } + + timeoutCtx, cancel := context.WithTimeout(context.Background(), taskTimeout) + defer cancel() + + encodedTask, err := json.Marshal(task) + if err != nil { + return echo.NewHTTPError(500, fmt.Errorf("task marshal error: %v", err)) + } + + cacheLifetime := minLifetime + taskResultBytes, err := h.CachedQueue.ProcessWorkCached(timeoutCtx, cacheLifetime, task.CacheKey(), encodedTask) + if err != nil { + return echo.NewHTTPError(500, fmt.Errorf("queued cache failed: %v", err)) + } + + var result models.ScreenshotTaskResult + if err := json.Unmarshal(taskResultBytes, &result); err != nil { + return echo.NewHTTPError(500, fmt.Errorf("task result unmarshal failed: %v", err)) + } + return c.Blob(200, "image/png", result.Image) +} + func (h *Handler) decodeSpecs(specsParam string) (Specs, error) { decodedSpecsParam, err := base64.StdEncoding.WithPadding(base64.NoPadding).DecodeString(specsParam) if err != nil { diff --git a/internal/extractors/pwextractor/pwextractor.go b/internal/extractors/pwextractor/pwextractor.go index a6c5ef9..99885a5 100644 --- a/internal/extractors/pwextractor/pwextractor.go +++ b/internal/extractors/pwextractor/pwextractor.go @@ -58,10 +58,10 @@ func (e *PwExtractor) Stop() error { return nil } -func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errRet error) { +func (e *PwExtractor) visitPage(pageUrl string, cb func(page playwright.Page) error) (errRet error) { page, err := e.chrome.NewPage() if err != nil { - return nil, fmt.Errorf("browser new page: %w", err) + return fmt.Errorf("browser new page: %w", err) } defer func() { err := page.Close() @@ -71,25 +71,58 @@ func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errR }() log.Debugf("Page opened") - if _, err := page.Goto(task.URL); err != nil { - return nil, fmt.Errorf("goto page: %w", err) + if _, err := page.Goto(pageUrl); err != nil { + return fmt.Errorf("goto page: %w", err) } - log.Debugf("Url %s visited", task.URL) + log.Debugf("Url %s visited", pageUrl) + defer log.Debugf("Visiting page %s finished", pageUrl) - parser := pageParser{ - task: task, - page: page, - } + return cb(page) +} - result, err = parser.parse() - if err != nil { - return nil, fmt.Errorf("parse page: %w", err) - } - if len(result.Items) == 0 { - return nil, fmt.Errorf("extract failed for all posts") - } +func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errRet error) { + errRet = e.visitPage(task.URL, func(page playwright.Page) error { + parser := pageParser{ + task: task, + page: page, + } + var err error + result, err = parser.parse() + if err != nil { + return fmt.Errorf("parse page: %w", err) + } + if len(result.Items) == 0 { + return fmt.Errorf("extract failed for all posts") + } + return nil + }) + return +} - return result, nil +func (e *PwExtractor) Screenshot(task models.Task) (result *models.ScreenshotTaskResult, errRet error) { + errRet = e.visitPage(task.URL, func(page playwright.Page) error { + err := page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{ + State: playwright.LoadStateNetworkidle, + Timeout: pwDuration("5s"), + }) + if err != nil { + log.Debugf("Wait for network idle: %w", err) + } + if err := page.SetViewportSize(1280, 800); err != nil { + return fmt.Errorf("set viewport size: %w", err) + } + screenshot, err := page.Screenshot(playwright.PageScreenshotOptions{ + Animations: playwright.ScreenshotAnimationsDisabled, + Timeout: pwDuration("5s"), + }) + if err != nil { + return fmt.Errorf("make screenshot: %w", err) + } + log.Infof("Screenshot finished; total size: %d bytes", len(screenshot)) + result = &models.ScreenshotTaskResult{Image: screenshot} + return nil + }) + return } type pageParser struct { diff --git a/internal/models/models.go b/internal/models/models.go index 50a0757..689dbe9 100644 --- a/internal/models/models.go +++ b/internal/models/models.go @@ -6,8 +6,16 @@ import ( "time" ) +type TaskType string + +const ( + TaskTypeExtract = "extract" + TaskTypePageScreenshot = "page_screenshot" +) + type Task struct { // While adding new fields, dont forget to alter caching func + TaskType TaskType URL string SelectorPost string SelectorTitle string @@ -30,7 +38,7 @@ func (t Task) CacheKey() string { h.Write([]byte(t.SelectorCreated)) h.Write([]byte(t.SelectorContent)) h.Write([]byte(t.SelectorEnclosure)) - return fmt.Sprintf("%x", h.Sum(nil)) + return fmt.Sprintf("%s_%x", t.TaskType, h.Sum(nil)) } type FeedItem struct { @@ -50,3 +58,7 @@ type TaskResult struct { Items []FeedItem Icon string } + +type ScreenshotTaskResult struct { + Image []byte // png +}