screenshots; some refactoring
This commit is contained in:
parent
f91d4416dc
commit
ab4966052b
@ -63,10 +63,15 @@ func main() {
|
|||||||
errRet = fmt.Errorf("unmarshal task: %w", err)
|
errRet = fmt.Errorf("unmarshal task: %w", err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
cacheKey = task.CacheKey()
|
var result any
|
||||||
result, err := pwe.Extract(task)
|
switch task.TaskType {
|
||||||
|
case models.TaskTypeExtract:
|
||||||
|
result, err = pwe.Extract(task)
|
||||||
|
case models.TaskTypePageScreenshot:
|
||||||
|
result, err = pwe.Screenshot(task)
|
||||||
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
errRet = fmt.Errorf("extract: %w", err)
|
errRet = fmt.Errorf("task processing: %w", err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
resultPayoad, err = json.Marshal(result)
|
resultPayoad, err = json.Marshal(result)
|
||||||
@ -74,7 +79,7 @@ func main() {
|
|||||||
errRet = fmt.Errorf("marshal result: %w", err)
|
errRet = fmt.Errorf("marshal result: %w", err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
return
|
return task.CacheKey(), resultPayoad, errRet
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Panicf("consume queue: %v", err)
|
log.Panicf("consume queue: %v", err)
|
||||||
|
|||||||
@ -13,7 +13,10 @@
|
|||||||
<form name="wizard">
|
<form name="wizard">
|
||||||
<div class="field">
|
<div class="field">
|
||||||
<div class="label"><label for="w_url">URL of page for converting</label></div>
|
<div class="label"><label for="w_url">URL of page for converting</label></div>
|
||||||
<div class="input"><input type="url" name="url" id="w_url"/></div>
|
<div class="input">
|
||||||
|
<input type="url" name="url" id="w_url"/>
|
||||||
|
<a id="page_screenshot_link" target="_blank">Render screenshot</a>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="field">
|
<div class="field">
|
||||||
<div class="label"><label for="w_selector_post">CSS Selector for post</label></div>
|
<div class="label"><label for="w_selector_post">CSS Selector for post</label></div>
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
#ready_url_link {
|
#ready_url_link, #page_screenshot_link {
|
||||||
visibility: hidden;
|
visibility: hidden;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -47,24 +47,35 @@ function displayUrl(url) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function baseUrl() {
|
function baseUrl() {
|
||||||
return document.location.origin + '/api/v1/render/';
|
return document.location.origin + '/api/v1';
|
||||||
}
|
}
|
||||||
|
|
||||||
async function genUrl() {
|
async function genUrl() {
|
||||||
let specs = readSpecsForm();
|
let specs = readSpecsForm();
|
||||||
let encodedSpecs = await encodeSpecs(specs);
|
let encodedSpecs = await encodeSpecs(specs);
|
||||||
let url = baseUrl() + encodedSpecs;
|
let url = baseUrl() + '/render/' + encodedSpecs;
|
||||||
displayUrl(url);
|
displayUrl(url);
|
||||||
}
|
}
|
||||||
|
|
||||||
async function editUrl() {
|
async function editUrl() {
|
||||||
let url = document.getElementById('url_input').value;
|
let url = document.getElementById('url_input').value;
|
||||||
let specs = await decodeSpecs(url.replace(baseUrl(), ''));
|
let specs = await decodeSpecs(url.replace(baseUrl() + '/render/', ''));
|
||||||
writeSpecsToForm(specs);
|
writeSpecsToForm(specs);
|
||||||
displayUrl(url);
|
displayUrl(url);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Keeps the "Render screenshot" link in sync with the URL field of the
 * wizard form: the link is hidden while the field is blank, otherwise it is
 * shown and pointed at the screenshot endpoint for the entered page.
 */
function onUrlInput() {
    const url = document.forms['wizard'].elements['url'].value.trim();
    const link = document.getElementById('page_screenshot_link');

    if (url.length === 0) {
        link.style.visibility = 'hidden';
        return;
    }

    // The page URL normally carries its own '?', '&' and '#'. Unencoded, those
    // characters would be parsed as part of the screenshot request itself and
    // the server would receive a truncated `url` parameter.
    link.href = `${baseUrl()}/screenshot?url=${encodeURIComponent(url)}`;
    link.style.visibility = 'visible';
}
|
||||||
|
|
||||||
document.addEventListener('DOMContentLoaded', ev => {
|
document.addEventListener('DOMContentLoaded', ev => {
|
||||||
document.getElementById('btn_gen_url').addEventListener('click', genUrl);
|
document.getElementById('btn_gen_url').addEventListener('click', genUrl);
|
||||||
document.getElementById('btn_edit').addEventListener('click', editUrl);
|
document.getElementById('btn_edit').addEventListener('click', editUrl);
|
||||||
|
document.getElementById('w_url').addEventListener('input', onUrlInput);
|
||||||
});
|
});
|
||||||
|
|||||||
@ -16,9 +16,16 @@ import (
|
|||||||
"github.com/labstack/gommon/log"
|
"github.com/labstack/gommon/log"
|
||||||
"html"
|
"html"
|
||||||
"io"
|
"io"
|
||||||
|
"net/url"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
taskTimeout = 20 * time.Second
|
||||||
|
minLifetime = taskTimeout
|
||||||
|
maxLifetime = 24 * time.Hour
|
||||||
|
)
|
||||||
|
|
||||||
type Handler struct {
|
type Handler struct {
|
||||||
validate *validator.Validate
|
validate *validator.Validate
|
||||||
CachedQueue adapters.CachedWorkQueue
|
CachedQueue adapters.CachedWorkQueue
|
||||||
@ -35,6 +42,7 @@ func New(cq adapters.CachedWorkQueue) *Handler {
|
|||||||
|
|
||||||
func (h *Handler) SetupRoutes(g *echo.Group) {
|
func (h *Handler) SetupRoutes(g *echo.Group) {
|
||||||
g.GET("/render/:specs", h.handleRender)
|
g.GET("/render/:specs", h.handleRender)
|
||||||
|
g.GET("/screenshot", h.handlePageScreenshot)
|
||||||
}
|
}
|
||||||
|
|
||||||
type Specs struct {
|
type Specs struct {
|
||||||
@ -58,6 +66,7 @@ func (h *Handler) handleRender(c echo.Context) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
task := models.Task{
|
task := models.Task{
|
||||||
|
TaskType: models.TaskTypeExtract,
|
||||||
URL: specs.URL,
|
URL: specs.URL,
|
||||||
SelectorPost: specs.SelectorPost,
|
SelectorPost: specs.SelectorPost,
|
||||||
SelectorTitle: specs.SelectorTitle,
|
SelectorTitle: specs.SelectorTitle,
|
||||||
@ -69,9 +78,6 @@ func (h *Handler) handleRender(c echo.Context) error {
|
|||||||
SelectorEnclosure: specs.SelectorEnclosure,
|
SelectorEnclosure: specs.SelectorEnclosure,
|
||||||
}
|
}
|
||||||
|
|
||||||
taskTimeout := 20 * time.Second
|
|
||||||
minLifetime := taskTimeout
|
|
||||||
maxLifetime := 24 * time.Hour
|
|
||||||
cacheLifetime, err := time.ParseDuration(specs.CacheLifetime)
|
cacheLifetime, err := time.ParseDuration(specs.CacheLifetime)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return echo.NewHTTPError(400, "invalid cache lifetime")
|
return echo.NewHTTPError(400, "invalid cache lifetime")
|
||||||
@ -98,7 +104,6 @@ func (h *Handler) handleRender(c echo.Context) error {
|
|||||||
|
|
||||||
var result models.TaskResult
|
var result models.TaskResult
|
||||||
if err := json.Unmarshal(taskResultBytes, &result); err != nil {
|
if err := json.Unmarshal(taskResultBytes, &result); err != nil {
|
||||||
log.Errorf("cached value unmarshal failed: %v", err)
|
|
||||||
return echo.NewHTTPError(500, fmt.Errorf("cached value unmarshal failed: %v", err))
|
return echo.NewHTTPError(500, fmt.Errorf("cached value unmarshal failed: %v", err))
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -112,6 +117,38 @@ func (h *Handler) handleRender(c echo.Context) error {
|
|||||||
return c.String(200, atom)
|
return c.String(200, atom)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *Handler) handlePageScreenshot(c echo.Context) error {
|
||||||
|
pageUrl := c.QueryParam("url")
|
||||||
|
if _, err := url.Parse(pageUrl); err != nil {
|
||||||
|
return echo.NewHTTPError(400, "url is invalid or missing")
|
||||||
|
}
|
||||||
|
|
||||||
|
task := models.Task{
|
||||||
|
TaskType: models.TaskTypePageScreenshot,
|
||||||
|
URL: pageUrl,
|
||||||
|
}
|
||||||
|
|
||||||
|
timeoutCtx, cancel := context.WithTimeout(context.Background(), taskTimeout)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
encodedTask, err := json.Marshal(task)
|
||||||
|
if err != nil {
|
||||||
|
return echo.NewHTTPError(500, fmt.Errorf("task marshal error: %v", err))
|
||||||
|
}
|
||||||
|
|
||||||
|
cacheLifetime := minLifetime
|
||||||
|
taskResultBytes, err := h.CachedQueue.ProcessWorkCached(timeoutCtx, cacheLifetime, task.CacheKey(), encodedTask)
|
||||||
|
if err != nil {
|
||||||
|
return echo.NewHTTPError(500, fmt.Errorf("queued cache failed: %v", err))
|
||||||
|
}
|
||||||
|
|
||||||
|
var result models.ScreenshotTaskResult
|
||||||
|
if err := json.Unmarshal(taskResultBytes, &result); err != nil {
|
||||||
|
return echo.NewHTTPError(500, fmt.Errorf("task result unmarshal failed: %v", err))
|
||||||
|
}
|
||||||
|
return c.Blob(200, "image/png", result.Image)
|
||||||
|
}
|
||||||
|
|
||||||
func (h *Handler) decodeSpecs(specsParam string) (Specs, error) {
|
func (h *Handler) decodeSpecs(specsParam string) (Specs, error) {
|
||||||
decodedSpecsParam, err := base64.StdEncoding.WithPadding(base64.NoPadding).DecodeString(specsParam)
|
decodedSpecsParam, err := base64.StdEncoding.WithPadding(base64.NoPadding).DecodeString(specsParam)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@ -58,10 +58,10 @@ func (e *PwExtractor) Stop() error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errRet error) {
|
func (e *PwExtractor) visitPage(pageUrl string, cb func(page playwright.Page) error) (errRet error) {
|
||||||
page, err := e.chrome.NewPage()
|
page, err := e.chrome.NewPage()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("browser new page: %w", err)
|
return fmt.Errorf("browser new page: %w", err)
|
||||||
}
|
}
|
||||||
defer func() {
|
defer func() {
|
||||||
err := page.Close()
|
err := page.Close()
|
||||||
@ -71,25 +71,58 @@ func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errR
|
|||||||
}()
|
}()
|
||||||
log.Debugf("Page opened")
|
log.Debugf("Page opened")
|
||||||
|
|
||||||
if _, err := page.Goto(task.URL); err != nil {
|
if _, err := page.Goto(pageUrl); err != nil {
|
||||||
return nil, fmt.Errorf("goto page: %w", err)
|
return fmt.Errorf("goto page: %w", err)
|
||||||
}
|
}
|
||||||
log.Debugf("Url %s visited", task.URL)
|
log.Debugf("Url %s visited", pageUrl)
|
||||||
|
defer log.Debugf("Visiting page %s finished", pageUrl)
|
||||||
|
|
||||||
|
return cb(page)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errRet error) {
|
||||||
|
errRet = e.visitPage(task.URL, func(page playwright.Page) error {
|
||||||
parser := pageParser{
|
parser := pageParser{
|
||||||
task: task,
|
task: task,
|
||||||
page: page,
|
page: page,
|
||||||
}
|
}
|
||||||
|
var err error
|
||||||
result, err = parser.parse()
|
result, err = parser.parse()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("parse page: %w", err)
|
return fmt.Errorf("parse page: %w", err)
|
||||||
}
|
}
|
||||||
if len(result.Items) == 0 {
|
if len(result.Items) == 0 {
|
||||||
return nil, fmt.Errorf("extract failed for all posts")
|
return fmt.Errorf("extract failed for all posts")
|
||||||
}
|
}
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
return result, nil
|
func (e *PwExtractor) Screenshot(task models.Task) (result *models.ScreenshotTaskResult, errRet error) {
|
||||||
|
errRet = e.visitPage(task.URL, func(page playwright.Page) error {
|
||||||
|
err := page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
|
||||||
|
State: playwright.LoadStateNetworkidle,
|
||||||
|
Timeout: pwDuration("5s"),
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
log.Debugf("Wait for network idle: %w", err)
|
||||||
|
}
|
||||||
|
if err := page.SetViewportSize(1280, 800); err != nil {
|
||||||
|
return fmt.Errorf("set viewport size: %w", err)
|
||||||
|
}
|
||||||
|
screenshot, err := page.Screenshot(playwright.PageScreenshotOptions{
|
||||||
|
Animations: playwright.ScreenshotAnimationsDisabled,
|
||||||
|
Timeout: pwDuration("5s"),
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("make screenshot: %w", err)
|
||||||
|
}
|
||||||
|
log.Infof("Screenshot finished; total size: %d bytes", len(screenshot))
|
||||||
|
result = &models.ScreenshotTaskResult{Image: screenshot}
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
type pageParser struct {
|
type pageParser struct {
|
||||||
|
|||||||
@ -6,8 +6,16 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// TaskType names the kind of work a Task requests. Its string value is part
// of the serialized task and of the cache key, so existing values must not be
// changed.
type TaskType string

// Known task types. The constants are declared with the TaskType type (not as
// untyped strings) so the compiler rejects accidental mixing with arbitrary
// string values.
const (
	// TaskTypeExtract requests extraction of feed items from a page.
	TaskTypeExtract TaskType = "extract"
	// TaskTypePageScreenshot requests a screenshot of a page.
	TaskTypePageScreenshot TaskType = "page_screenshot"
)
|
||||||
|
|
||||||
type Task struct {
|
type Task struct {
|
||||||
// While adding new fields, dont forget to alter caching func
|
// While adding new fields, dont forget to alter caching func
|
||||||
|
TaskType TaskType
|
||||||
URL string
|
URL string
|
||||||
SelectorPost string
|
SelectorPost string
|
||||||
SelectorTitle string
|
SelectorTitle string
|
||||||
@ -30,7 +38,7 @@ func (t Task) CacheKey() string {
|
|||||||
h.Write([]byte(t.SelectorCreated))
|
h.Write([]byte(t.SelectorCreated))
|
||||||
h.Write([]byte(t.SelectorContent))
|
h.Write([]byte(t.SelectorContent))
|
||||||
h.Write([]byte(t.SelectorEnclosure))
|
h.Write([]byte(t.SelectorEnclosure))
|
||||||
return fmt.Sprintf("%x", h.Sum(nil))
|
return fmt.Sprintf("%s_%x", t.TaskType, h.Sum(nil))
|
||||||
}
|
}
|
||||||
|
|
||||||
type FeedItem struct {
|
type FeedItem struct {
|
||||||
@ -50,3 +58,7 @@ type TaskResult struct {
|
|||||||
Items []FeedItem
|
Items []FeedItem
|
||||||
Icon string
|
Icon string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type ScreenshotTaskResult struct {
|
||||||
|
Image []byte // png
|
||||||
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user