compare revs (for testing)
Support mocking dates
This commit is contained in:
parent
d54e464816
commit
728792da26
6
.gitignore
vendored
6
.gitignore
vendored
@ -2,5 +2,7 @@
|
||||
/trash/
|
||||
/todo.md
|
||||
/.env.dev
|
||||
/task.json
|
||||
/screenshot.png
|
||||
/task*.json
|
||||
/screenshot*.png
|
||||
node_modules
|
||||
.vite
|
||||
|
||||
@ -3,19 +3,26 @@ package main
|
||||
import (
|
||||
"encoding/json"
|
||||
"github.com/egor3f/rssalchemy/internal/config"
|
||||
"github.com/egor3f/rssalchemy/internal/dateparser"
|
||||
"github.com/egor3f/rssalchemy/internal/extractors/pwextractor"
|
||||
"github.com/egor3f/rssalchemy/internal/models"
|
||||
"github.com/labstack/gommon/log"
|
||||
"github.com/yassinebenaid/godump"
|
||||
"io"
|
||||
"os"
|
||||
"time"
|
||||
)
|
||||
|
||||
func main() {
|
||||
log.SetLevel(log.DEBUG)
|
||||
log.SetHeader(`${time_rfc3339_nano} ${level}`)
|
||||
log.SetHeader(`${level}`)
|
||||
|
||||
taskFile, err := os.Open("task.json")
|
||||
taskFileName := "task.json"
|
||||
if len(os.Args) > 1 {
|
||||
taskFileName = os.Args[1]
|
||||
}
|
||||
|
||||
taskFile, err := os.Open(taskFileName)
|
||||
if err != nil {
|
||||
log.Panicf("open file: %v", err)
|
||||
}
|
||||
@ -35,7 +42,14 @@ func main() {
|
||||
log.Panicf("read config: %v", err)
|
||||
}
|
||||
|
||||
pwe, err := pwextractor.New(cfg)
|
||||
pwe, err := pwextractor.New(pwextractor.Config{
|
||||
Proxy: cfg.Proxy,
|
||||
DateParser: &dateparser.DateParser{
|
||||
CurrentTimeFunc: func() time.Time {
|
||||
return time.Date(2025, 01, 10, 10, 00, 00, 00, time.UTC)
|
||||
},
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
log.Panicf("create pw extractor: %v", err)
|
||||
}
|
||||
@ -51,6 +65,7 @@ func main() {
|
||||
scrResult, err := pwe.Screenshot(task)
|
||||
if err != nil {
|
||||
log.Errorf("screenshot failed: %v", err)
|
||||
return
|
||||
}
|
||||
err = os.WriteFile("screenshot.png", scrResult.Image, 0600)
|
||||
if err != nil {
|
||||
|
||||
@ -6,12 +6,14 @@ import (
|
||||
"fmt"
|
||||
"github.com/egor3f/rssalchemy/internal/adapters/natsadapter"
|
||||
"github.com/egor3f/rssalchemy/internal/config"
|
||||
"github.com/egor3f/rssalchemy/internal/dateparser"
|
||||
"github.com/egor3f/rssalchemy/internal/extractors/pwextractor"
|
||||
"github.com/egor3f/rssalchemy/internal/models"
|
||||
"github.com/labstack/gommon/log"
|
||||
"github.com/nats-io/nats.go"
|
||||
"os"
|
||||
"os/signal"
|
||||
"time"
|
||||
)
|
||||
|
||||
func main() {
|
||||
@ -47,7 +49,12 @@ func main() {
|
||||
log.Panicf("create nats adapter: %v", err)
|
||||
}
|
||||
|
||||
pwe, err := pwextractor.New(cfg)
|
||||
pwe, err := pwextractor.New(pwextractor.Config{
|
||||
Proxy: cfg.Proxy,
|
||||
DateParser: &dateparser.DateParser{
|
||||
CurrentTimeFunc: time.Now,
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
log.Panicf("create pw extractor: %v", err)
|
||||
}
|
||||
|
||||
67
compare_revs.sh
Normal file
67
compare_revs.sh
Normal file
@ -0,0 +1,67 @@
|
||||
#!/usr/bin/env bash

# Regression check between the last committed version and the working tree.
#
# 1. Copy the project to a temp directory and reset that copy to HEAD, so it
#    holds the last committed version.
# 2. For every task file in test_tasks/, run the extractor from the HEAD copy
#    and from the current working tree, capturing the combined output of each.
# 3. Compare the two outputs per task and report how many tasks differ.
#
# Must be started from the project root (task files are looked up in
# ./test_tasks and `go run` resolves the module from the current directory).
#
# Exit status: 0 when all outputs match, 1 when any output differs or when
# either version fails to run.
#
# Caveats: this test uses real websites and parsing tasks - so it's not idempotent.
# I should think about better solution

set -e

old_dir=$(mktemp -d)
cur_dir=$(pwd)
task_dir=$cur_dir/test_tasks

# Single quotes: expand $old_dir when the trap fires, and keep it quoted.
trap 'echo cleaning up && rm -rf "$old_dir" && echo done' EXIT

echo "Copying project to $old_dir"
time rsync -ar --exclude "node_modules" "$cur_dir/" "$old_dir"
git -C "$old_dir" reset --hard HEAD

failed=0
total=0

for task in "$task_dir"/*; do
    # With no task files the glob stays unexpanded; do not run it as a task.
    [ -e "$task" ] || continue
    total=$((total + 1))

    echo "Task $task"
    # Output files are intentionally left behind so they can be diffed later.
    old_out=$(mktemp)
    echo "Old version output: $old_out"
    cur_out=$(mktemp)
    echo "Cur version output: $cur_out"

    # Drop a stale screenshot so a leftover from a previous task is not
    # mistaken for the output of this one.
    rm -f "$old_dir/screenshot.png"

    # `if ! ...` keeps `set -e` from aborting before the output is shown.
    if ! (cd "$old_dir" && go run github.com/egor3f/rssalchemy/cmd/extractor "$task") > "$old_out" 2>&1; then
        echo "Failed to run old version"
        cat "$old_out"
        exit 1
    fi

    if ! go run github.com/egor3f/rssalchemy/cmd/extractor "$task" > "$cur_out" 2>&1; then
        echo "Failed to run new version"
        cat "$cur_out"
        exit 1
    fi

    if [ "$(cat "$old_out")" != "$(cat "$cur_out")" ]; then
        echo "Output differ for $task. To inspect use: "
        echo "diff -u $old_out $cur_out"
        failed=$((failed + 1))
        if [ -f "$old_dir/screenshot.png" ]; then
            cp "$old_dir/screenshot.png" "$cur_dir/screenshot_old.png"
            echo Screenshot of old version output copied to cwd
        fi
    fi
done

echo "-----------"
echo "Failed: $failed of $total"

# Numeric comparison; `[ $failed > 0 ]` would redirect into a file named "0"
# and always succeed.
if [ "$failed" -gt 0 ]; then
    exit 1
fi
|
||||
1
go.mod
1
go.mod
@ -42,7 +42,6 @@ require (
|
||||
golang.org/x/crypto v0.32.0 // indirect
|
||||
golang.org/x/exp v0.0.0-20220321173239-a90fa8a75705 // indirect
|
||||
golang.org/x/net v0.34.0 // indirect
|
||||
golang.org/x/sync v0.10.0 // indirect
|
||||
golang.org/x/sys v0.29.0 // indirect
|
||||
golang.org/x/text v0.21.0 // indirect
|
||||
golang.org/x/time v0.8.0 // indirect
|
||||
|
||||
2
go.sum
2
go.sum
@ -109,8 +109,6 @@ golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k=
|
||||
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ=
|
||||
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
|
||||
34
internal/dateparser/dateparser.go
Normal file
34
internal/dateparser/dateparser.go
Normal file
@ -0,0 +1,34 @@
|
||||
package dateparser
|
||||
|
||||
import (
|
||||
godateparser "github.com/markusmobius/go-dateparser"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type DateParser struct {
|
||||
CurrentTimeFunc func() time.Time
|
||||
}
|
||||
|
||||
func (d *DateParser) ParseDate(str string) (time.Time, error) {
|
||||
str = strings.TrimSpace(str)
|
||||
|
||||
dt, err := godateparser.Parse(&godateparser.Configuration{
|
||||
CurrentTime: d.CurrentTimeFunc(),
|
||||
}, str)
|
||||
if err == nil {
|
||||
return dt.Time, nil
|
||||
}
|
||||
|
||||
parts := strings.Split(str, " ")
|
||||
for len(parts) > 1 {
|
||||
newStr := strings.Join(parts, " ")
|
||||
dt, err = godateparser.Parse(nil, newStr)
|
||||
if err == nil {
|
||||
return dt.Time, err
|
||||
}
|
||||
parts = parts[1:]
|
||||
}
|
||||
|
||||
return time.Time{}, err
|
||||
}
|
||||
@ -4,12 +4,12 @@ import (
|
||||
"context"
|
||||
_ "embed"
|
||||
"fmt"
|
||||
"github.com/egor3f/rssalchemy/internal/config"
|
||||
"github.com/egor3f/rssalchemy/internal/models"
|
||||
"github.com/labstack/gommon/log"
|
||||
"github.com/playwright-community/playwright-go"
|
||||
"maps"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Timeouts
|
||||
@ -24,12 +24,22 @@ var (
|
||||
var userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36"
|
||||
var secChUa = `"Chromium";v="132", "Google Chrome";v="132", "Not-A.Brand";v="99"`
|
||||
|
||||
type DateParser interface {
|
||||
ParseDate(string) (time.Time, error)
|
||||
}
|
||||
|
||||
type PwExtractor struct {
|
||||
pw *playwright.Playwright
|
||||
chrome playwright.Browser
|
||||
dateParser DateParser
|
||||
}
|
||||
|
||||
func New(cfg config.Config) (*PwExtractor, error) {
|
||||
type Config struct {
|
||||
Proxy string
|
||||
DateParser DateParser
|
||||
}
|
||||
|
||||
func New(cfg Config) (*PwExtractor, error) {
|
||||
e := PwExtractor{}
|
||||
var err error
|
||||
e.pw, err = playwright.Run()
|
||||
@ -49,6 +59,7 @@ func New(cfg config.Config) (*PwExtractor, error) {
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("run chromium: %w", err)
|
||||
}
|
||||
e.dateParser = cfg.DateParser
|
||||
return &e, nil
|
||||
}
|
||||
|
||||
@ -142,6 +153,7 @@ func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errR
|
||||
parser := pageParser{
|
||||
task: task,
|
||||
page: page,
|
||||
dateParser: e.dateParser,
|
||||
}
|
||||
var err error
|
||||
result, err = parser.parse()
|
||||
@ -185,6 +197,7 @@ func (e *PwExtractor) Screenshot(task models.Task) (result *models.ScreenshotTas
|
||||
type pageParser struct {
|
||||
task models.Task
|
||||
page playwright.Page
|
||||
dateParser DateParser
|
||||
|
||||
// next fields only for debugging. Shit code, to do better later
|
||||
postIdx int
|
||||
@ -303,11 +316,11 @@ func (p *pageParser) extractPost(post playwright.Locator) (models.FeedItem, erro
|
||||
|
||||
createdDateStr := p.must(post.Locator(p.task.SelectorCreated).First().InnerText(defOptInText))
|
||||
log.Debugf("date=%s", createdDateStr)
|
||||
createdDate, err := parseDate(createdDateStr)
|
||||
createdDate, err := p.dateParser.ParseDate(createdDateStr)
|
||||
if err != nil {
|
||||
log.Errorf("dateparser: %v", err)
|
||||
} else {
|
||||
item.Created = createdDate.Time
|
||||
item.Created = createdDate
|
||||
}
|
||||
|
||||
return item, nil
|
||||
|
||||
@ -2,8 +2,6 @@ package pwextractor
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/markusmobius/go-dateparser"
|
||||
"github.com/markusmobius/go-dateparser/date"
|
||||
"github.com/playwright-community/playwright-go"
|
||||
"net/url"
|
||||
"slices"
|
||||
@ -83,24 +81,3 @@ func parseCookieString(cookieStr string) ([][2]string, error) {
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func parseDate(str string) (d date.Date, err error) {
|
||||
str = strings.TrimSpace(str)
|
||||
|
||||
d, err = dateparser.Parse(nil, str)
|
||||
if err == nil {
|
||||
return
|
||||
}
|
||||
|
||||
parts := strings.Split(str, " ")
|
||||
for len(parts) > 1 {
|
||||
newStr := strings.Join(parts, " ")
|
||||
d, err = dateparser.Parse(nil, newStr)
|
||||
if err == nil {
|
||||
return
|
||||
}
|
||||
parts = parts[1:]
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
13
test_tasks/vombat.json
Normal file
13
test_tasks/vombat.json
Normal file
@ -0,0 +1,13 @@
|
||||
{
|
||||
"TaskType": "extract",
|
||||
"URL": "https://vombat.su/new/all",
|
||||
"SelectorPost": "div.post-body",
|
||||
"SelectorTitle": "h1 a",
|
||||
"SelectorLink": "h1 a",
|
||||
"SelectorDescription": "div.post-content-block p",
|
||||
"SelectorAuthor": "a:has(\u003e span.post-author)",
|
||||
"SelectorCreated": "div:nth-of-type(1) \u003e div:nth-of-type(1) \u003e div:nth-of-type(1) \u003e div:nth-of-type(2)",
|
||||
"SelectorContent": "div.post-content-block",
|
||||
"SelectorEnclosure": "article img.object-contain",
|
||||
"Headers": {}
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user