compare revs (for testing)
Support mocking dates
This commit is contained in:
parent
0d48fe8554
commit
925bf49a8e
6
.gitignore
vendored
6
.gitignore
vendored
@ -2,5 +2,7 @@
|
|||||||
/trash/
|
/trash/
|
||||||
/todo.md
|
/todo.md
|
||||||
/.env.dev
|
/.env.dev
|
||||||
/task.json
|
/task*.json
|
||||||
/screenshot.png
|
/screenshot*.png
|
||||||
|
node_modules
|
||||||
|
.vite
|
||||||
|
|||||||
@ -3,19 +3,26 @@ package main
|
|||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"github.com/egor3f/rssalchemy/internal/config"
|
"github.com/egor3f/rssalchemy/internal/config"
|
||||||
|
"github.com/egor3f/rssalchemy/internal/dateparser"
|
||||||
"github.com/egor3f/rssalchemy/internal/extractors/pwextractor"
|
"github.com/egor3f/rssalchemy/internal/extractors/pwextractor"
|
||||||
"github.com/egor3f/rssalchemy/internal/models"
|
"github.com/egor3f/rssalchemy/internal/models"
|
||||||
"github.com/labstack/gommon/log"
|
"github.com/labstack/gommon/log"
|
||||||
"github.com/yassinebenaid/godump"
|
"github.com/yassinebenaid/godump"
|
||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
log.SetLevel(log.DEBUG)
|
log.SetLevel(log.DEBUG)
|
||||||
log.SetHeader(`${time_rfc3339_nano} ${level}`)
|
log.SetHeader(`${level}`)
|
||||||
|
|
||||||
taskFile, err := os.Open("task.json")
|
taskFileName := "task.json"
|
||||||
|
if len(os.Args) > 1 {
|
||||||
|
taskFileName = os.Args[1]
|
||||||
|
}
|
||||||
|
|
||||||
|
taskFile, err := os.Open(taskFileName)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Panicf("open file: %v", err)
|
log.Panicf("open file: %v", err)
|
||||||
}
|
}
|
||||||
@ -35,7 +42,14 @@ func main() {
|
|||||||
log.Panicf("read config: %v", err)
|
log.Panicf("read config: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
pwe, err := pwextractor.New(cfg)
|
pwe, err := pwextractor.New(pwextractor.Config{
|
||||||
|
Proxy: cfg.Proxy,
|
||||||
|
DateParser: &dateparser.DateParser{
|
||||||
|
CurrentTimeFunc: func() time.Time {
|
||||||
|
return time.Date(2025, 01, 10, 10, 00, 00, 00, time.UTC)
|
||||||
|
},
|
||||||
|
},
|
||||||
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Panicf("create pw extractor: %v", err)
|
log.Panicf("create pw extractor: %v", err)
|
||||||
}
|
}
|
||||||
@ -51,6 +65,7 @@ func main() {
|
|||||||
scrResult, err := pwe.Screenshot(task)
|
scrResult, err := pwe.Screenshot(task)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Errorf("screenshot failed: %v", err)
|
log.Errorf("screenshot failed: %v", err)
|
||||||
|
return
|
||||||
}
|
}
|
||||||
err = os.WriteFile("screenshot.png", scrResult.Image, 0600)
|
err = os.WriteFile("screenshot.png", scrResult.Image, 0600)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@ -6,12 +6,14 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"github.com/egor3f/rssalchemy/internal/adapters/natsadapter"
|
"github.com/egor3f/rssalchemy/internal/adapters/natsadapter"
|
||||||
"github.com/egor3f/rssalchemy/internal/config"
|
"github.com/egor3f/rssalchemy/internal/config"
|
||||||
|
"github.com/egor3f/rssalchemy/internal/dateparser"
|
||||||
"github.com/egor3f/rssalchemy/internal/extractors/pwextractor"
|
"github.com/egor3f/rssalchemy/internal/extractors/pwextractor"
|
||||||
"github.com/egor3f/rssalchemy/internal/models"
|
"github.com/egor3f/rssalchemy/internal/models"
|
||||||
"github.com/labstack/gommon/log"
|
"github.com/labstack/gommon/log"
|
||||||
"github.com/nats-io/nats.go"
|
"github.com/nats-io/nats.go"
|
||||||
"os"
|
"os"
|
||||||
"os/signal"
|
"os/signal"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
@ -47,7 +49,12 @@ func main() {
|
|||||||
log.Panicf("create nats adapter: %v", err)
|
log.Panicf("create nats adapter: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
pwe, err := pwextractor.New(cfg)
|
pwe, err := pwextractor.New(pwextractor.Config{
|
||||||
|
Proxy: cfg.Proxy,
|
||||||
|
DateParser: &dateparser.DateParser{
|
||||||
|
CurrentTimeFunc: time.Now,
|
||||||
|
},
|
||||||
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Panicf("create pw extractor: %v", err)
|
log.Panicf("create pw extractor: %v", err)
|
||||||
}
|
}
|
||||||
|
|||||||
67
compare_revs.sh
Normal file
67
compare_revs.sh
Normal file
@ -0,0 +1,67 @@
|
|||||||
|
#!/usr/bin/env bash

# Regression check: compare extractor output of the last committed version
# against the current working tree.
#
# 1. Copy the project to a temp directory and reset the copy to HEAD, so it
#    holds the last committed version.
# 2. For every task file in test_tasks/, run the extractor from the copy and
#    from the current working tree, capturing stdout+stderr of each run.
# 3. Compare the two outputs and report every task whose output differs.
#
# Exit status: 0 when all outputs match, 1 when any task differs or any
# extractor run fails.
#
# Caveats: this test uses real websites and parsing tasks - so it's not idempotent.
# I should think about better solution

set -e

old_dir=$(mktemp -d)
cur_dir=$(pwd)
task_dir="$cur_dir/test_tasks"

# Single quotes defer expansion to trap time; the inner quotes keep the path
# intact even if the temp directory name contains whitespace.
trap 'echo cleaning up && rm -rf "$old_dir" && echo done' EXIT

echo "Copying project to $old_dir"
time rsync -ar --exclude "node_modules" "$cur_dir/" "$old_dir"
# -C runs git inside the copy without changing this shell's working directory.
git -C "$old_dir" reset --hard HEAD

failed=0
total=0

for task in "$task_dir"/*; do
    # With no task files the glob stays literal; skip that non-existent entry.
    [ -e "$task" ] || continue
    total=$((total + 1))

    echo "Task $task"
    old_out=$(mktemp)
    echo "Old version output: $old_out"
    cur_out=$(mktemp)
    echo "Cur version output: $cur_out"

    # The old version is run in a subshell so the directory change cannot leak.
    # Commands tested by `if` are exempt from `set -e`, so no toggling is needed.
    rm -f "$old_dir/screenshot.png"
    if ! (cd "$old_dir" && go run github.com/egor3f/rssalchemy/cmd/extractor "$task" > "$old_out" 2>&1); then
        echo "Failed to run old version"
        cat "$old_out"
        exit 1
    fi

    if ! go run github.com/egor3f/rssalchemy/cmd/extractor "$task" > "$cur_out" 2>&1; then
        echo "Failed to run new version"
        cat "$cur_out"
        exit 1
    fi

    if [ "$(cat "$old_out")" != "$(cat "$cur_out")" ]; then
        echo "Output differ for $task. To inspect use: "
        echo "diff -u $old_out $cur_out"
        failed=$((failed + 1))
        if [ -f "$old_dir/screenshot.png" ]; then
            cp "$old_dir/screenshot.png" "$cur_dir/screenshot_old.png"
            echo Screenshot of old version output copied to cwd
        fi
    fi
done

echo "-----------"
echo "Failed: $failed of $total"

# Numeric comparison: `[ $failed > 0 ]` would redirect into a file named "0"
# and always evaluate to true.
if [ "$failed" -gt 0 ]; then
    exit 1
fi
|
||||||
1
go.mod
1
go.mod
@ -42,7 +42,6 @@ require (
|
|||||||
golang.org/x/crypto v0.32.0 // indirect
|
golang.org/x/crypto v0.32.0 // indirect
|
||||||
golang.org/x/exp v0.0.0-20220321173239-a90fa8a75705 // indirect
|
golang.org/x/exp v0.0.0-20220321173239-a90fa8a75705 // indirect
|
||||||
golang.org/x/net v0.34.0 // indirect
|
golang.org/x/net v0.34.0 // indirect
|
||||||
golang.org/x/sync v0.10.0 // indirect
|
|
||||||
golang.org/x/sys v0.29.0 // indirect
|
golang.org/x/sys v0.29.0 // indirect
|
||||||
golang.org/x/text v0.21.0 // indirect
|
golang.org/x/text v0.21.0 // indirect
|
||||||
golang.org/x/time v0.8.0 // indirect
|
golang.org/x/time v0.8.0 // indirect
|
||||||
|
|||||||
2
go.sum
2
go.sum
@ -109,8 +109,6 @@ golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k=
|
|||||||
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||||
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||||
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||||
golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ=
|
|
||||||
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
|
||||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||||
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||||
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||||
|
|||||||
34
internal/dateparser/dateparser.go
Normal file
34
internal/dateparser/dateparser.go
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
package dateparser
|
||||||
|
|
||||||
|
import (
|
||||||
|
godateparser "github.com/markusmobius/go-dateparser"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
type DateParser struct {
|
||||||
|
CurrentTimeFunc func() time.Time
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *DateParser) ParseDate(str string) (time.Time, error) {
|
||||||
|
str = strings.TrimSpace(str)
|
||||||
|
|
||||||
|
dt, err := godateparser.Parse(&godateparser.Configuration{
|
||||||
|
CurrentTime: d.CurrentTimeFunc(),
|
||||||
|
}, str)
|
||||||
|
if err == nil {
|
||||||
|
return dt.Time, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
parts := strings.Split(str, " ")
|
||||||
|
for len(parts) > 1 {
|
||||||
|
newStr := strings.Join(parts, " ")
|
||||||
|
dt, err = godateparser.Parse(nil, newStr)
|
||||||
|
if err == nil {
|
||||||
|
return dt.Time, err
|
||||||
|
}
|
||||||
|
parts = parts[1:]
|
||||||
|
}
|
||||||
|
|
||||||
|
return time.Time{}, err
|
||||||
|
}
|
||||||
@ -4,12 +4,12 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
_ "embed"
|
_ "embed"
|
||||||
"fmt"
|
"fmt"
|
||||||
"github.com/egor3f/rssalchemy/internal/config"
|
|
||||||
"github.com/egor3f/rssalchemy/internal/models"
|
"github.com/egor3f/rssalchemy/internal/models"
|
||||||
"github.com/labstack/gommon/log"
|
"github.com/labstack/gommon/log"
|
||||||
"github.com/playwright-community/playwright-go"
|
"github.com/playwright-community/playwright-go"
|
||||||
"maps"
|
"maps"
|
||||||
"strings"
|
"strings"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Timeouts
|
// Timeouts
|
||||||
@ -24,12 +24,22 @@ var (
|
|||||||
var userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36"
|
var userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36"
|
||||||
var secChUa = `"Chromium";v="132", "Google Chrome";v="132", "Not-A.Brand";v="99"`
|
var secChUa = `"Chromium";v="132", "Google Chrome";v="132", "Not-A.Brand";v="99"`
|
||||||
|
|
||||||
type PwExtractor struct {
|
type DateParser interface {
|
||||||
pw *playwright.Playwright
|
ParseDate(string) (time.Time, error)
|
||||||
chrome playwright.Browser
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func New(cfg config.Config) (*PwExtractor, error) {
|
type PwExtractor struct {
|
||||||
|
pw *playwright.Playwright
|
||||||
|
chrome playwright.Browser
|
||||||
|
dateParser DateParser
|
||||||
|
}
|
||||||
|
|
||||||
|
type Config struct {
|
||||||
|
Proxy string
|
||||||
|
DateParser DateParser
|
||||||
|
}
|
||||||
|
|
||||||
|
func New(cfg Config) (*PwExtractor, error) {
|
||||||
e := PwExtractor{}
|
e := PwExtractor{}
|
||||||
var err error
|
var err error
|
||||||
e.pw, err = playwright.Run()
|
e.pw, err = playwright.Run()
|
||||||
@ -49,6 +59,7 @@ func New(cfg config.Config) (*PwExtractor, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("run chromium: %w", err)
|
return nil, fmt.Errorf("run chromium: %w", err)
|
||||||
}
|
}
|
||||||
|
e.dateParser = cfg.DateParser
|
||||||
return &e, nil
|
return &e, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -140,8 +151,9 @@ func (e *PwExtractor) visitPage(task models.Task, cb func(page playwright.Page)
|
|||||||
func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errRet error) {
|
func (e *PwExtractor) Extract(task models.Task) (result *models.TaskResult, errRet error) {
|
||||||
errRet = e.visitPage(task, func(page playwright.Page) error {
|
errRet = e.visitPage(task, func(page playwright.Page) error {
|
||||||
parser := pageParser{
|
parser := pageParser{
|
||||||
task: task,
|
task: task,
|
||||||
page: page,
|
page: page,
|
||||||
|
dateParser: e.dateParser,
|
||||||
}
|
}
|
||||||
var err error
|
var err error
|
||||||
result, err = parser.parse()
|
result, err = parser.parse()
|
||||||
@ -183,8 +195,9 @@ func (e *PwExtractor) Screenshot(task models.Task) (result *models.ScreenshotTas
|
|||||||
}
|
}
|
||||||
|
|
||||||
type pageParser struct {
|
type pageParser struct {
|
||||||
task models.Task
|
task models.Task
|
||||||
page playwright.Page
|
page playwright.Page
|
||||||
|
dateParser DateParser
|
||||||
|
|
||||||
// next fields only for debugging. Shit code, to do better later
|
// next fields only for debugging. Shit code, to do better later
|
||||||
postIdx int
|
postIdx int
|
||||||
@ -303,11 +316,11 @@ func (p *pageParser) extractPost(post playwright.Locator) (models.FeedItem, erro
|
|||||||
|
|
||||||
createdDateStr := p.must(post.Locator(p.task.SelectorCreated).First().InnerText(defOptInText))
|
createdDateStr := p.must(post.Locator(p.task.SelectorCreated).First().InnerText(defOptInText))
|
||||||
log.Debugf("date=%s", createdDateStr)
|
log.Debugf("date=%s", createdDateStr)
|
||||||
createdDate, err := parseDate(createdDateStr)
|
createdDate, err := p.dateParser.ParseDate(createdDateStr)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Errorf("dateparser: %v", err)
|
log.Errorf("dateparser: %v", err)
|
||||||
} else {
|
} else {
|
||||||
item.Created = createdDate.Time
|
item.Created = createdDate
|
||||||
}
|
}
|
||||||
|
|
||||||
return item, nil
|
return item, nil
|
||||||
|
|||||||
@ -2,8 +2,6 @@ package pwextractor
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"github.com/markusmobius/go-dateparser"
|
|
||||||
"github.com/markusmobius/go-dateparser/date"
|
|
||||||
"github.com/playwright-community/playwright-go"
|
"github.com/playwright-community/playwright-go"
|
||||||
"net/url"
|
"net/url"
|
||||||
"slices"
|
"slices"
|
||||||
@ -83,24 +81,3 @@ func parseCookieString(cookieStr string) ([][2]string, error) {
|
|||||||
|
|
||||||
return result, nil
|
return result, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func parseDate(str string) (d date.Date, err error) {
|
|
||||||
str = strings.TrimSpace(str)
|
|
||||||
|
|
||||||
d, err = dateparser.Parse(nil, str)
|
|
||||||
if err == nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
parts := strings.Split(str, " ")
|
|
||||||
for len(parts) > 1 {
|
|
||||||
newStr := strings.Join(parts, " ")
|
|
||||||
d, err = dateparser.Parse(nil, newStr)
|
|
||||||
if err == nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
parts = parts[1:]
|
|
||||||
}
|
|
||||||
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|||||||
13
test_tasks/vombat.json
Normal file
13
test_tasks/vombat.json
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
{
|
||||||
|
"TaskType": "extract",
|
||||||
|
"URL": "https://vombat.su/new/all",
|
||||||
|
"SelectorPost": "div.post-body",
|
||||||
|
"SelectorTitle": "h1 a",
|
||||||
|
"SelectorLink": "h1 a",
|
||||||
|
"SelectorDescription": "div.post-content-block p",
|
||||||
|
"SelectorAuthor": "a:has(\u003e span.post-author)",
|
||||||
|
"SelectorCreated": "div:nth-of-type(1) \u003e div:nth-of-type(1) \u003e div:nth-of-type(1) \u003e div:nth-of-type(2)",
|
||||||
|
"SelectorContent": "div.post-content-block",
|
||||||
|
"SelectorEnclosure": "article img.object-contain",
|
||||||
|
"Headers": {}
|
||||||
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user