compare revs (for testing)

Support mocking dates

Support mocking dates

Support mocking dates

Support mocking dates
This commit is contained in:
Egor Aristov 2025-01-27 17:32:29 +03:00
parent d54e464816
commit 728792da26
10 changed files with 168 additions and 43 deletions

6
.gitignore vendored
View File

@ -2,5 +2,7 @@
/trash/ /trash/
/todo.md /todo.md
/.env.dev /.env.dev
/task.json /task*.json
/screenshot.png /screenshot*.png
node_modules
.vite

View File

@ -3,19 +3,26 @@ package main
import ( import (
"encoding/json" "encoding/json"
"github.com/egor3f/rssalchemy/internal/config" "github.com/egor3f/rssalchemy/internal/config"
"github.com/egor3f/rssalchemy/internal/dateparser"
"github.com/egor3f/rssalchemy/internal/extractors/pwextractor" "github.com/egor3f/rssalchemy/internal/extractors/pwextractor"
"github.com/egor3f/rssalchemy/internal/models" "github.com/egor3f/rssalchemy/internal/models"
"github.com/labstack/gommon/log" "github.com/labstack/gommon/log"
"github.com/yassinebenaid/godump" "github.com/yassinebenaid/godump"
"io" "io"
"os" "os"
"time"
) )
func main() { func main() {
log.SetLevel(log.DEBUG) log.SetLevel(log.DEBUG)
log.SetHeader(`${time_rfc3339_nano} ${level}`) log.SetHeader(`${level}`)
taskFile, err := os.Open("task.json") taskFileName := "task.json"
if len(os.Args) > 1 {
taskFileName = os.Args[1]
}
taskFile, err := os.Open(taskFileName)
if err != nil { if err != nil {
log.Panicf("open file: %v", err) log.Panicf("open file: %v", err)
} }
@ -35,7 +42,14 @@ func main() {
log.Panicf("read config: %v", err) log.Panicf("read config: %v", err)
} }
pwe, err := pwextractor.New(cfg) pwe, err := pwextractor.New(pwextractor.Config{
Proxy: cfg.Proxy,
DateParser: &dateparser.DateParser{
CurrentTimeFunc: func() time.Time {
return time.Date(2025, 01, 10, 10, 00, 00, 00, time.UTC)
},
},
})
if err != nil { if err != nil {
log.Panicf("create pw extractor: %v", err) log.Panicf("create pw extractor: %v", err)
} }
@ -51,6 +65,7 @@ func main() {
scrResult, err := pwe.Screenshot(task) scrResult, err := pwe.Screenshot(task)
if err != nil { if err != nil {
log.Errorf("screenshot failed: %v", err) log.Errorf("screenshot failed: %v", err)
return
} }
err = os.WriteFile("screenshot.png", scrResult.Image, 0600) err = os.WriteFile("screenshot.png", scrResult.Image, 0600)
if err != nil { if err != nil {

View File

@ -6,12 +6,14 @@ import (
"fmt" "fmt"
"github.com/egor3f/rssalchemy/internal/adapters/natsadapter" "github.com/egor3f/rssalchemy/internal/adapters/natsadapter"
"github.com/egor3f/rssalchemy/internal/config" "github.com/egor3f/rssalchemy/internal/config"
"github.com/egor3f/rssalchemy/internal/dateparser"
"github.com/egor3f/rssalchemy/internal/extractors/pwextractor" "github.com/egor3f/rssalchemy/internal/extractors/pwextractor"
"github.com/egor3f/rssalchemy/internal/models" "github.com/egor3f/rssalchemy/internal/models"
"github.com/labstack/gommon/log" "github.com/labstack/gommon/log"
"github.com/nats-io/nats.go" "github.com/nats-io/nats.go"
"os" "os"
"os/signal" "os/signal"
"time"
) )
func main() { func main() {
@ -47,7 +49,12 @@ func main() {
log.Panicf("create nats adapter: %v", err) log.Panicf("create nats adapter: %v", err)
} }
pwe, err := pwextractor.New(cfg) pwe, err := pwextractor.New(pwextractor.Config{
Proxy: cfg.Proxy,
DateParser: &dateparser.DateParser{
CurrentTimeFunc: time.Now,
},
})
if err != nil { if err != nil {
log.Panicf("create pw extractor: %v", err) log.Panicf("create pw extractor: %v", err)
} }

67
compare_revs.sh Normal file
View File

@ -0,0 +1,67 @@
#!/usr/bin/env bash
# Regression check: compare the extractor output of the last committed
# revision with the output of the current working tree.
#
# Steps:
#   1. Copy the project to a temp directory and reset the copy to HEAD, so it
#      holds the last committed version.
#   2. For every task file in test_tasks/, run the extractor from the copy and
#      from the current directory, capturing stdout+stderr of each run.
#   3. Report every task whose outputs differ; exit 1 if any differ.
#
# Caveats: this test uses real websites and parsing tasks - so it's not idempotent.
# I should think about better solution
set -e

old_dir=$(mktemp -d)
cur_dir=$(pwd)
task_dir=$cur_dir/test_tasks

# Single quotes: $old_dir is expanded (and stays quoted) when the trap fires.
trap 'echo cleaning up && rm -rf "$old_dir" && echo done' EXIT

echo "Copying project to $old_dir"
time rsync -ar --exclude "node_modules" "$cur_dir/" "$old_dir"

cd "$old_dir"
git reset --hard HEAD
cd -

failed=0
total=0
for task in "$task_dir"/*; do
    total=$((total + 1))
    echo "Task $task"
    old_out=$(mktemp)
    echo "Old version output: $old_out"
    cur_out=$(mktemp)
    echo "Cur version output: $cur_out"

    # Run the committed version. Drop a stale screenshot first so that one
    # found afterwards is known to belong to this run.
    cd "$old_dir"
    rm -f "$old_dir/screenshot.png"
    # Testing the command inside `if` keeps `set -e` from aborting before the
    # captured output can be shown.
    if ! go run github.com/egor3f/rssalchemy/cmd/extractor "$task" > "$old_out" 2>&1; then
        echo "Failed to run old version"
        cat "$old_out"
        exit 1
    fi

    # Run the working-tree version.
    cd -
    if ! go run github.com/egor3f/rssalchemy/cmd/extractor "$task" > "$cur_out" 2>&1; then
        echo "Failed to run new version"
        cat "$cur_out"
        exit 1
    fi

    if [ "$(cat "$old_out")" != "$(cat "$cur_out")" ]; then
        echo "Output differ for $task. To inspect use: "
        echo "diff -u $old_out $cur_out"
        failed=$((failed + 1))
        if [ -f "$old_dir/screenshot.png" ]; then
            cp "$old_dir/screenshot.png" "$cur_dir/screenshot_old.png"
            echo Screenshot of old version output copied to cwd
        fi
    fi
done

echo "-----------"
echo "Failed: $failed of $total"
# Numeric comparison: inside [ ], `>` is a redirection (it would create a file
# named "0" and make the test always true), so -gt is required here.
if [ "$failed" -gt 0 ]; then
    exit 1
fi

1
go.mod
View File

@ -42,7 +42,6 @@ require (
golang.org/x/crypto v0.32.0 // indirect golang.org/x/crypto v0.32.0 // indirect
golang.org/x/exp v0.0.0-20220321173239-a90fa8a75705 // indirect golang.org/x/exp v0.0.0-20220321173239-a90fa8a75705 // indirect
golang.org/x/net v0.34.0 // indirect golang.org/x/net v0.34.0 // indirect
golang.org/x/sync v0.10.0 // indirect
golang.org/x/sys v0.29.0 // indirect golang.org/x/sys v0.29.0 // indirect
golang.org/x/text v0.21.0 // indirect golang.org/x/text v0.21.0 // indirect
golang.org/x/time v0.8.0 // indirect golang.org/x/time v0.8.0 // indirect

2
go.sum
View File

@ -109,8 +109,6 @@ golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ=
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=

View File

@ -0,0 +1,34 @@
package dateparser
import (
godateparser "github.com/markusmobius/go-dateparser"
"strings"
"time"
)
// DateParser parses date strings with go-dateparser, taking the current time
// from an injectable function instead of reading the clock directly.
type DateParser struct {
// CurrentTimeFunc returns the time that ParseDate passes to go-dateparser
// as Configuration.CurrentTime.
CurrentTimeFunc func() time.Time
}
// ParseDate parses a human-readable date string into a time.Time.
//
// The input is trimmed of surrounding whitespace and handed to go-dateparser.
// If that fails, leading space-separated parts are dropped one at a time and
// the remainder is parsed again, as long as at least two parts remain.
//
// Every attempt uses the time returned by CurrentTimeFunc as the parser's
// current time, read once per call. When CurrentTimeFunc is nil, time.Now is
// used.
//
// It returns the first successfully parsed time. If no attempt succeeds, it
// returns the zero time.Time and the error of the last attempt.
func (d *DateParser) ParseDate(str string) (time.Time, error) {
	// Fall back to the wall clock so a DateParser without an injected time
	// source does not panic on a nil function call.
	now := time.Now
	if d.CurrentTimeFunc != nil {
		now = d.CurrentTimeFunc
	}
	current := now()
	// A fresh configuration per attempt, all carrying the same instant, so the
	// fallback attempts honour the injected time just like the first one.
	newConfig := func() *godateparser.Configuration {
		return &godateparser.Configuration{CurrentTime: current}
	}

	str = strings.TrimSpace(str)
	dt, err := godateparser.Parse(newConfig(), str)
	if err == nil {
		return dt.Time, nil
	}

	// Re-joining all parts would reproduce str, which has already failed with
	// this configuration, so start from the first shortened suffix.
	parts := strings.Split(str, " ")
	for len(parts) > 2 {
		parts = parts[1:]
		dt, err = godateparser.Parse(newConfig(), strings.Join(parts, " "))
		if err == nil {
			return dt.Time, nil
		}
	}
	return time.Time{}, err
}

View File

@ -4,12 +4,12 @@ import (
"context" "context"
_ "embed" _ "embed"
"fmt" "fmt"
"github.com/egor3f/rssalchemy/internal/config"
"github.com/egor3f/rssalchemy/internal/models" "github.com/egor3f/rssalchemy/internal/models"
"github.com/labstack/gommon/log" "github.com/labstack/gommon/log"
"github.com/playwright-community/playwright-go" "github.com/playwright-community/playwright-go"
"maps" "maps"
"strings" "strings"
"time"
) )
// Timeouts // Timeouts
@ -24,12 +24,22 @@ var (
var userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36" var userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36"
var secChUa = `"Chromium";v="132", "Google Chrome";v="132", "Not-A.Brand";v="99"` var secChUa = `"Chromium";v="132", "Google Chrome";v="132", "Not-A.Brand";v="99"`
// DateParser is the date-parsing dependency of the extractor; the page parser
// calls it to turn the text of a post's "created" element into a time.Time.
type DateParser interface {
// ParseDate parses the given string into a time.Time or returns an error.
ParseDate(string) (time.Time, error)
}
type PwExtractor struct { type PwExtractor struct {
pw *playwright.Playwright pw *playwright.Playwright
chrome playwright.Browser chrome playwright.Browser
dateParser DateParser
} }
func New(cfg config.Config) (*PwExtractor, error) { type Config struct {
Proxy string
DateParser DateParser
}
func New(cfg Config) (*PwExtractor, error) {
e := PwExtractor{} e := PwExtractor{}
var err error var err error
e.pw, err = playwright.Run() e.pw, err = playwright.Run()
@ -49,6 +59,7 @@ func New(cfg config.Config) (*PwExtractor, error) {
if err != nil { if err != nil {
return nil, fmt.Errorf("run chromium: %w", err) return nil, fmt.Errorf("run chromium: %w", err)
} }
e.dateParser = cfg.DateParser
return &e, nil return &e, nil
} }
@ -142,6 +153,7 @@ func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errR
parser := pageParser{ parser := pageParser{
task: task, task: task,
page: page, page: page,
dateParser: e.dateParser,
} }
var err error var err error
result, err = parser.parse() result, err = parser.parse()
@ -185,6 +197,7 @@ func (e *PwExtractor) Screenshot(task models.Task) (result *models.ScreenshotTas
type pageParser struct { type pageParser struct {
task models.Task task models.Task
page playwright.Page page playwright.Page
dateParser DateParser
// next fields only for debugging. Shit code, to do better later // next fields only for debugging. Shit code, to do better later
postIdx int postIdx int
@ -303,11 +316,11 @@ func (p *pageParser) extractPost(post playwright.Locator) (models.FeedItem, erro
createdDateStr := p.must(post.Locator(p.task.SelectorCreated).First().InnerText(defOptInText)) createdDateStr := p.must(post.Locator(p.task.SelectorCreated).First().InnerText(defOptInText))
log.Debugf("date=%s", createdDateStr) log.Debugf("date=%s", createdDateStr)
createdDate, err := parseDate(createdDateStr) createdDate, err := p.dateParser.ParseDate(createdDateStr)
if err != nil { if err != nil {
log.Errorf("dateparser: %v", err) log.Errorf("dateparser: %v", err)
} else { } else {
item.Created = createdDate.Time item.Created = createdDate
} }
return item, nil return item, nil

View File

@ -2,8 +2,6 @@ package pwextractor
import ( import (
"fmt" "fmt"
"github.com/markusmobius/go-dateparser"
"github.com/markusmobius/go-dateparser/date"
"github.com/playwright-community/playwright-go" "github.com/playwright-community/playwright-go"
"net/url" "net/url"
"slices" "slices"
@ -83,24 +81,3 @@ func parseCookieString(cookieStr string) ([][2]string, error) {
return result, nil return result, nil
} }
// parseDate trims surrounding whitespace from str and parses it with
// dateparser.Parse using a nil configuration. If that fails, it splits str on
// single spaces and retries on the re-joined parts, dropping the leading part
// after every failed attempt, for as long as more than one part remains.
// It returns on the first successful parse; otherwise the named results hold
// the outcome of the last attempt.
func parseDate(str string) (d date.Date, err error) {
str = strings.TrimSpace(str)
d, err = dateparser.Parse(nil, str)
if err == nil {
return
}
// Fallback: try progressively shorter suffixes of the space-separated parts.
parts := strings.Split(str, " ")
for len(parts) > 1 {
newStr := strings.Join(parts, " ")
d, err = dateparser.Parse(nil, newStr)
if err == nil {
return
}
// Drop the leading part before the next attempt.
parts = parts[1:]
}
return
}

13
test_tasks/vombat.json Normal file
View File

@ -0,0 +1,13 @@
{
"TaskType": "extract",
"URL": "https://vombat.su/new/all",
"SelectorPost": "div.post-body",
"SelectorTitle": "h1 a",
"SelectorLink": "h1 a",
"SelectorDescription": "div.post-content-block p",
"SelectorAuthor": "a:has(\u003e span.post-author)",
"SelectorCreated": "div:nth-of-type(1) \u003e div:nth-of-type(1) \u003e div:nth-of-type(1) \u003e div:nth-of-type(2)",
"SelectorContent": "div.post-content-block",
"SelectorEnclosure": "article img.object-contain",
"Headers": {}
}