If you execute `./run.sh`, browsertrix-crawler will start up and crawl https://www.trm.dk/nyheder. On that page it runs a custom behaviour that keeps loading more results until all of them are shown, and then feeds every discovered URL to the crawl queue.
Last active
October 1, 2025 21:49
-
-
Save edsu/88b65a8f658b537831c6cfdc12e88a71 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Custom browsertrix-crawler behavior for the news listing page at
 * https://www.trm.dk/nyheder.
 *
 * It repeatedly clicks the "more results" button until that button is gone
 * or no longer visible, then hands every link found in the results column
 * to the crawler through `ctx.Lib.addLink`.
 */
class TrmBehavior {
  static id = "TrmBehavior";

  static runInIframes = false;

  /**
   * @returns {boolean} true only when the current page is exactly the
   *   news listing URL.
   */
  static isMatch() {
    return document.location.href === "https://www.trm.dk/nyheder";
  }

  /**
   * @returns {object} an empty initial state object.
   */
  static init() {
    return {};
  }

  /**
   * Expands the full listing and queues every story link.
   *
   * @param {object} ctx - behavior context; `ctx.Lib` must provide
   *   `sleep`, `getState` and `addLink`.
   * @yields the value returned by `getState` for each step taken.
   */
  async* run(ctx) {
    const { sleep, getState, addLink } = ctx.Lib;

    // Keep clicking the button for more news stories from the archive.
    while (true) {
      window.scrollTo({ top: document.body.scrollHeight, behavior: "smooth" });
      const button = document.querySelector("button#pagenation");

      // A missing button (null) is treated the same as a hidden one:
      // there is nothing left to load, so pagination is finished.
      if (!button?.checkVisibility()) {
        yield getState(ctx, "finished pagination");
        break;
      }

      yield getState(ctx, "clicking pagination button");
      button.click();
      // Pause before re-checking the button so the click has time to take effect.
      await sleep(1000);
    }

    // Add each news story URL to the queue.
    for (const a of document.querySelectorAll(".col-md-8 a")) {
      // Anchors without a usable href have nothing to crawl.
      if (!a.href) {
        continue;
      }
      yield getState(ctx, `adding ${a.href}`, "links");
      // Awaited in case addLink returns a promise, so each link is handed
      // over before the next one is processed.
      await addLink(a.href);
    }
  }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Crawl https://www.trm.dk/nyheder with browsertrix-crawler, using the custom
# behavior in ./behavior.js (the current directory is mounted at /crawls/).

# Fetch the latest crawler image before running.
docker pull webrecorder/browsertrix-crawler:latest

# "$PWD" is quoted so the bind mount still works when the working directory
# contains spaces or glob characters.
# Port 9037 is published so the screencast on --screencastPort is reachable
# from the host.
docker run -p 9037:9037 --rm -v "$PWD":/crawls/ webrecorder/browsertrix-crawler:latest crawl \
  --url https://www.trm.dk/nyheder \
  --scopeType any \
  --generateWACZ true \
  --screencastPort 9037 \
  --behaviors siteSpecific \
  --behaviorTimeout 3600 \
  --customBehaviors /crawls/behavior.js
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment