From 925bf49a8e8ce8d09e59b9737dc731938563bffc Mon Sep 17 00:00:00 2001 From: Egor Aristov Date: Mon, 27 Jan 2025 17:32:29 +0300 Subject: [PATCH] compare revs (for testing) Support mocking dates Support mocking dates Support mocking dates Support mocking dates --- .gitignore | 6 +- cmd/extractor/extractor.go | 21 +++++- cmd/worker/worker.go | 9 ++- compare_revs.sh | 67 +++++++++++++++++++ go.mod | 1 - go.sum | 2 - internal/dateparser/dateparser.go | 34 ++++++++++ .../extractors/pwextractor/pwextractor.go | 35 +++++++--- internal/extractors/pwextractor/utils.go | 23 ------- test_tasks/vombat.json | 13 ++++ 10 files changed, 168 insertions(+), 43 deletions(-) create mode 100644 compare_revs.sh create mode 100644 internal/dateparser/dateparser.go create mode 100644 test_tasks/vombat.json diff --git a/.gitignore b/.gitignore index 641b765..ad9281f 100644 --- a/.gitignore +++ b/.gitignore @@ -2,5 +2,7 @@ /trash/ /todo.md /.env.dev -/task.json -/screenshot.png +/task*.json +/screenshot*.png +node_modules +.vite diff --git a/cmd/extractor/extractor.go b/cmd/extractor/extractor.go index bcf7113..ba5abaf 100644 --- a/cmd/extractor/extractor.go +++ b/cmd/extractor/extractor.go @@ -3,19 +3,26 @@ package main import ( "encoding/json" "github.com/egor3f/rssalchemy/internal/config" + "github.com/egor3f/rssalchemy/internal/dateparser" "github.com/egor3f/rssalchemy/internal/extractors/pwextractor" "github.com/egor3f/rssalchemy/internal/models" "github.com/labstack/gommon/log" "github.com/yassinebenaid/godump" "io" "os" + "time" ) func main() { log.SetLevel(log.DEBUG) - log.SetHeader(`${time_rfc3339_nano} ${level}`) + log.SetHeader(`${level}`) - taskFile, err := os.Open("task.json") + taskFileName := "task.json" + if len(os.Args) > 1 { + taskFileName = os.Args[1] + } + + taskFile, err := os.Open(taskFileName) if err != nil { log.Panicf("open file: %v", err) } @@ -35,7 +42,14 @@ func main() { log.Panicf("read config: %v", err) } - pwe, err := pwextractor.New(cfg) + pwe, err := 
pwextractor.New(pwextractor.Config{ + Proxy: cfg.Proxy, + DateParser: &dateparser.DateParser{ + CurrentTimeFunc: func() time.Time { + return time.Date(2025, 01, 10, 10, 00, 00, 00, time.UTC) + }, + }, + }) if err != nil { log.Panicf("create pw extractor: %v", err) } @@ -51,6 +65,7 @@ func main() { scrResult, err := pwe.Screenshot(task) if err != nil { log.Errorf("screenshot failed: %v", err) + return } err = os.WriteFile("screenshot.png", scrResult.Image, 0600) if err != nil { diff --git a/cmd/worker/worker.go b/cmd/worker/worker.go index ffbcb5d..2fdc748 100644 --- a/cmd/worker/worker.go +++ b/cmd/worker/worker.go @@ -6,12 +6,14 @@ import ( "fmt" "github.com/egor3f/rssalchemy/internal/adapters/natsadapter" "github.com/egor3f/rssalchemy/internal/config" + "github.com/egor3f/rssalchemy/internal/dateparser" "github.com/egor3f/rssalchemy/internal/extractors/pwextractor" "github.com/egor3f/rssalchemy/internal/models" "github.com/labstack/gommon/log" "github.com/nats-io/nats.go" "os" "os/signal" + "time" ) func main() { @@ -47,7 +49,12 @@ func main() { log.Panicf("create nats adapter: %v", err) } - pwe, err := pwextractor.New(cfg) + pwe, err := pwextractor.New(pwextractor.Config{ + Proxy: cfg.Proxy, + DateParser: &dateparser.DateParser{ + CurrentTimeFunc: time.Now, + }, + }) if err != nil { log.Panicf("create pw extractor: %v", err) } diff --git a/compare_revs.sh b/compare_revs.sh new file mode 100644 index 0000000..4e04c96 --- /dev/null +++ b/compare_revs.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash + +# Copy project to temp directory and then reset it to HEAD to capture output of last committed version +# Then go back to current dir, capture output of current working tree +# Compare outputs for several tasks, notify if they differ +# Caveats: this test uses real websites and parsing tasks, so its results are not reproducible. 
+# TODO: find a more reliable way to test this + +set -e + +old_dir=$(mktemp -d) +cur_dir=$(pwd) +task_dir=$cur_dir/test_tasks + +trap 'echo cleaning up && rm -rf "$old_dir" && echo done' EXIT + +echo "Copying project to $old_dir" +time rsync -ar --exclude "node_modules" "$cur_dir/" "$old_dir" +cd "$old_dir" +git reset --hard HEAD +cd - + +failed=0 + +for task in "$task_dir"/*; do + echo "Task $task" + old_out=$(mktemp) + echo "Old version output: $old_out" + cur_out=$(mktemp) + echo "Cur version output: $cur_out" + + set +e + cd "$old_dir" + rm -f "$old_dir/screenshot.png" + go run github.com/egor3f/rssalchemy/cmd/extractor "$task" > "$old_out" 2>&1 + if [ $? != 0 ]; then + echo "Failed to run old version" + cat "$old_out" + exit 1 + fi + cd - + go run github.com/egor3f/rssalchemy/cmd/extractor "$task" > "$cur_out" 2>&1 + if [ $? != 0 ]; then + echo "Failed to run new version" + cat "$cur_out" + exit 1 + fi + set -e + + if [ "$(cat "$old_out")" != "$(cat "$cur_out")" ]; then + echo "Output differ for $task. To inspect use: " + echo "diff -u $old_out $cur_out" + failed=$((failed + 1)) + if [ -f "$old_dir/screenshot.png" ]; then + cp "$old_dir/screenshot.png" "$cur_dir/screenshot_old.png" + echo Screenshot of old version output copied to cwd + fi + fi +done + +echo "-----------" +total=$(ls -1q "$task_dir"/* | wc -l) +echo "Failed: $failed of $total" + +if [ "$failed" -gt 0 ]; then + exit 1 +fi diff --git a/go.mod b/go.mod index 9d08717..3ed540e 100644 --- a/go.mod +++ b/go.mod @@ -42,7 +42,6 @@ require ( golang.org/x/crypto v0.32.0 // indirect golang.org/x/exp v0.0.0-20220321173239-a90fa8a75705 // indirect golang.org/x/net v0.34.0 // indirect - golang.org/x/sync v0.10.0 // indirect golang.org/x/sys v0.29.0 // indirect golang.org/x/text v0.21.0 // indirect golang.org/x/time v0.8.0 // indirect diff --git a/go.sum b/go.sum index dc2ed57..d725b36 100644 --- a/go.sum +++ b/go.sum @@ -109,8 +109,6 @@ golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k= golang.org/x/sync 
v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= -golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= diff --git a/internal/dateparser/dateparser.go b/internal/dateparser/dateparser.go new file mode 100644 index 0000000..36ac2ca --- /dev/null +++ b/internal/dateparser/dateparser.go @@ -0,0 +1,45 @@ +package dateparser + +import ( + godateparser "github.com/markusmobius/go-dateparser" + "strings" + "time" +) + +// DateParser parses human-readable date strings, resolving relative +// expressions against the time returned by CurrentTimeFunc. +type DateParser struct { + // CurrentTimeFunc supplies the reference time; time.Now is used when it is nil. + CurrentTimeFunc func() time.Time +} + +// ParseDate parses str as a date. When the whole string does not parse, +// leading space-separated words are dropped one at a time and every +// remainder of at least two words is retried. The error of the last +// attempt is returned when nothing parses. +func (d *DateParser) ParseDate(str string) (time.Time, error) { + str = strings.TrimSpace(str) + + now := time.Now + if d.CurrentTimeFunc != nil { + now = d.CurrentTimeFunc + } + // One configuration for every attempt, so that the fallbacks honour the mocked time too. + cfg := &godateparser.Configuration{CurrentTime: now()} + + dt, err := godateparser.Parse(cfg, str) + if err == nil { + return dt.Time, nil + } + + // The full string was already tried above, so start from the second word. + words := strings.Split(str, " ") + for words = words[1:]; len(words) > 1; words = words[1:] { + dt, err = godateparser.Parse(cfg, strings.Join(words, " ")) + if err == nil { + return dt.Time, nil + } + } + + return time.Time{}, err +} diff --git a/internal/extractors/pwextractor/pwextractor.go b/internal/extractors/pwextractor/pwextractor.go index eb8949e..2d9cb69 100644 --- a/internal/extractors/pwextractor/pwextractor.go +++ b/internal/extractors/pwextractor/pwextractor.go @@ -4,12 +4,12 @@ import ( "context" _ "embed" "fmt" - "github.com/egor3f/rssalchemy/internal/config" 
"github.com/egor3f/rssalchemy/internal/models" "github.com/labstack/gommon/log" "github.com/playwright-community/playwright-go" "maps" "strings" + "time" ) // Timeouts @@ -24,12 +24,22 @@ var ( var userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36" var secChUa = `"Chromium";v="132", "Google Chrome";v="132", "Not-A.Brand";v="99"` -type PwExtractor struct { - pw *playwright.Playwright - chrome playwright.Browser +type DateParser interface { + ParseDate(string) (time.Time, error) } -func New(cfg config.Config) (*PwExtractor, error) { +type PwExtractor struct { + pw *playwright.Playwright + chrome playwright.Browser + dateParser DateParser +} + +type Config struct { + Proxy string + DateParser DateParser +} + +func New(cfg Config) (*PwExtractor, error) { e := PwExtractor{} var err error e.pw, err = playwright.Run() @@ -49,6 +59,7 @@ func New(cfg config.Config) (*PwExtractor, error) { if err != nil { return nil, fmt.Errorf("run chromium: %w", err) } + e.dateParser = cfg.DateParser return &e, nil } @@ -140,8 +151,9 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page) func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errRet error) { errRet = e.visitPage(task, func(page playwright.Page) error { parser := pageParser{ - task: task, - page: page, + task: task, + page: page, + dateParser: e.dateParser, } var err error result, err = parser.parse() @@ -183,8 +195,9 @@ func (e *PwExtractor) Screenshot(task models.Task) (result *models.ScreenshotTas } type pageParser struct { - task models.Task - page playwright.Page + task models.Task + page playwright.Page + dateParser DateParser // next fields only for debugging. 
Shit code, to do better later postIdx int @@ -303,11 +316,11 @@ func (p *pageParser) extractPost(post playwright.Locator) (models.FeedItem, erro createdDateStr := p.must(post.Locator(p.task.SelectorCreated).First().InnerText(defOptInText)) log.Debugf("date=%s", createdDateStr) - createdDate, err := parseDate(createdDateStr) + createdDate, err := p.dateParser.ParseDate(createdDateStr) if err != nil { log.Errorf("dateparser: %v", err) } else { - item.Created = createdDate.Time + item.Created = createdDate } return item, nil diff --git a/internal/extractors/pwextractor/utils.go b/internal/extractors/pwextractor/utils.go index db3ac70..071b795 100644 --- a/internal/extractors/pwextractor/utils.go +++ b/internal/extractors/pwextractor/utils.go @@ -2,8 +2,6 @@ package pwextractor import ( "fmt" - "github.com/markusmobius/go-dateparser" - "github.com/markusmobius/go-dateparser/date" "github.com/playwright-community/playwright-go" "net/url" "slices" @@ -83,24 +81,3 @@ func parseCookieString(cookieStr string) ([][2]string, error) { return result, nil } - -func parseDate(str string) (d date.Date, err error) { - str = strings.TrimSpace(str) - - d, err = dateparser.Parse(nil, str) - if err == nil { - return - } - - parts := strings.Split(str, " ") - for len(parts) > 1 { - newStr := strings.Join(parts, " ") - d, err = dateparser.Parse(nil, newStr) - if err == nil { - return - } - parts = parts[1:] - } - - return -} diff --git a/test_tasks/vombat.json b/test_tasks/vombat.json new file mode 100644 index 0000000..6a3b79c --- /dev/null +++ b/test_tasks/vombat.json @@ -0,0 +1,13 @@ +{ + "TaskType": "extract", + "URL": "https://vombat.su/new/all", + "SelectorPost": "div.post-body", + "SelectorTitle": "h1 a", + "SelectorLink": "h1 a", + "SelectorDescription": "div.post-content-block p", + "SelectorAuthor": "a:has(\u003e span.post-author)", + "SelectorCreated": "div:nth-of-type(1) \u003e div:nth-of-type(1) \u003e div:nth-of-type(1) \u003e div:nth-of-type(2)", + "SelectorContent": 
"div.post-content-block", + "SelectorEnclosure": "article img.object-contain", + "Headers": {} +}