From 0e32cc3f176baa3314e08b9eaf6a565e0b61f0fe Mon Sep 17 00:00:00 2001 From: Egor Aristov Date: Tue, 6 May 2025 20:47:10 +0300 Subject: [PATCH] extract from attribute --- internal/api/http/handler.go | 32 ++++++++++++------- internal/extractors/pwextractor/pageparser.go | 10 +++++- internal/models/models.go | 31 +++++++++++------- 3 files changed, 50 insertions(+), 23 deletions(-) diff --git a/internal/api/http/handler.go b/internal/api/http/handler.go index 9b34509..13e34aa 100644 --- a/internal/api/http/handler.go +++ b/internal/api/http/handler.go @@ -75,18 +75,28 @@ func (h *Handler) handleRender(c echo.Context) error { return echo.NewHTTPError(400, fmt.Errorf("decode specs: %w", err)) } + extractFrom, ok := map[pb.ExtractFrom]models.ExtractFrom{ + pb.ExtractFrom_InnerText: models.ExtractFrom_InnerText, + pb.ExtractFrom_Attribute: models.ExtractFrom_Attribute, + }[specs.CreatedExtractFrom] + if !ok { + return echo.NewHTTPError(400, "invalid extract from") + } + task := models.Task{ - TaskType: models.TaskTypeExtract, - URL: specs.Url, - SelectorPost: specs.SelectorPost, - SelectorTitle: specs.SelectorTitle, - SelectorLink: specs.SelectorLink, - SelectorDescription: specs.SelectorDescription, - SelectorAuthor: specs.SelectorAuthor, - SelectorCreated: specs.SelectorCreated, - SelectorContent: specs.SelectorContent, - SelectorEnclosure: specs.SelectorEnclosure, - Headers: extractHeaders(c), + TaskType: models.TaskTypeExtract, + URL: specs.Url, + SelectorPost: specs.SelectorPost, + SelectorTitle: specs.SelectorTitle, + SelectorLink: specs.SelectorLink, + SelectorDescription: specs.SelectorDescription, + SelectorAuthor: specs.SelectorAuthor, + SelectorCreated: specs.SelectorCreated, + CreatedExtractFrom: extractFrom, + CreatedAttributeName: specs.CreatedAttributeName, + SelectorContent: specs.SelectorContent, + SelectorEnclosure: specs.SelectorEnclosure, + Headers: extractHeaders(c), } cacheLifetime, err := time.ParseDuration(specs.CacheLifetime) diff --git 
a/internal/extractors/pwextractor/pageparser.go b/internal/extractors/pwextractor/pageparser.go index ef1a99e..b2c9a53 100644 --- a/internal/extractors/pwextractor/pageparser.go +++ b/internal/extractors/pwextractor/pageparser.go @@ -114,7 +114,15 @@ func (p *pageParser) extractPost(post playwright.Locator) (models.FeedItem, erro item.Enclosure = newLocator(post, p.task.SelectorEnclosure).First().GetAttribute("src") - createdDateStr := newLocator(post, p.task.SelectorCreated).First().InnerText() + var createdDateStr string + switch p.task.CreatedExtractFrom { + case models.ExtractFrom_InnerText: + createdDateStr = newLocator(post, p.task.SelectorCreated).First().InnerText() + case models.ExtractFrom_Attribute: + createdDateStr = newLocator(post, p.task.SelectorCreated).First().GetAttribute(p.task.CreatedAttributeName) + default: + return models.FeedItem{}, fmt.Errorf("invalid task.CreatedExtractFrom") + } log.Debugf("date=%s", createdDateStr) createdDate, err := p.dateParser.ParseDate(createdDateStr) if err != nil { diff --git a/internal/models/models.go b/internal/models/models.go index 7099d92..d5cf818 100644 --- a/internal/models/models.go +++ b/internal/models/models.go @@ -13,19 +13,28 @@ const ( TaskTypePageScreenshot = "page_screenshot" ) +type ExtractFrom int + +const ( + ExtractFrom_InnerText ExtractFrom = 0 + ExtractFrom_Attribute ExtractFrom = 1 +) + type Task struct { // While adding new fields, dont forget to alter caching func - TaskType TaskType - URL string - SelectorPost string - SelectorTitle string - SelectorLink string - SelectorDescription string - SelectorAuthor string - SelectorCreated string - SelectorContent string - SelectorEnclosure string - Headers map[string]string + TaskType TaskType + URL string + SelectorPost string + SelectorTitle string + SelectorLink string + SelectorDescription string + SelectorAuthor string + SelectorCreated string + CreatedExtractFrom ExtractFrom + CreatedAttributeName string + SelectorContent string + 
SelectorEnclosure string + Headers map[string]string } func (t Task) CacheKey() string {