Created
September 7, 2019 09:11
-
-
Save michalbcz/0fd5e3249ad541376412eb63ac39a0e6 to your computer and use it in GitHub Desktop.
Puppeteer-based Node.js scraper of https://zbranekvalitne.cz/zbrojni-prukaz/testove-otazky, including categorization of the questions into groups
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const puppeteer = require('puppeteer') | |
const fs = require('fs') | |
// Entry point: an async function expression that is invoked immediately.
// `void` marks the returned promise as intentionally not awaited; every failure
// is handled inside (logged, and reported through a non-zero exit code).
void (async () => {
  const url = 'https://zbranekvalitne.cz/zbrojni-prukaz/testove-otazky'
  try {
    console.log("I am scraping questions from " + url)
    const browser = await puppeteer.launch({
      // headless: false, // launch headful mode - good for debugging purposes (you will see what happened inside browser)
      // slowMo: 50, // slow down puppeteer script so that it's easier to follow visually
    })

    let groups = []
    const allQuestions = []
    try {
      const page = await browser.newPage()

      // Debugging aid: uncomment to forward console output of the in-page
      // scraping code (everything run through page.evaluate) to the Node.js console.
      /*
      page.on('console', (consoleMessage) => {
        if (consoleMessage.type() !== 'warning') {
          console.log(consoleMessage.text())
        }
      })
      */

      await page.goto(url)
      console.log("Browser opened, starting to evaluate scraping script...")
      console.log("Scraping groups and its links")
      groups = await extractGroup(page)

      for (const group of groups) {
        if (groupHasSubgroups(group, groups)) {
          console.log(
            `
            Skipping group ${group.groupId} with sub groups. Otherwise we would have duplicated
            questions (from parent group and its subgroups), because parent group link leads to all questions for it's subgroups.
            `
          )
          continue // skip this group
        }

        console.log("Extracting questions for group", group)
        // Start waiting for the navigation together with the click that triggers
        // it, so the navigation cannot finish before we begin to wait for it.
        await Promise.all([
          page.waitForNavigation(),
          page.evaluate((groupName) => {
            const groupItems = Array.from(document.querySelectorAll('form#setup div.menu div.item'))
            // Label of a menu item, read like the group names are: the first
            // <span> when there is one, otherwise the whole item text.
            const labelOf = (groupItem) => {
              const span = groupItem.querySelector('span')
              return (span !== null ? span.textContent : groupItem.textContent).trim()
            }
            // Prefer an exact label match so that a group whose name is contained
            // in another item's text cannot select that other item; fall back to
            // the substring match only when no label matches exactly.
            const groupItemElement =
              groupItems.find((groupItem) => labelOf(groupItem) === groupName) ||
              groupItems.find((groupItem) => groupItem.textContent.includes(groupName))
            if (!groupItemElement) {
              throw new Error(`Menu item for group "${groupName}" was not found`)
            }
            groupItemElement.click()
          }, group.name)
        ])

        const questions = await page.evaluate(extractQuestions)
        // Tag each question with the group it was scraped from, without mutating
        // the objects returned by page.evaluate.
        const questionsForGroup = questions.map((question) => ({ ...question, groupId: group.groupId }))
        allQuestions.push(...questionsForGroup)
      }
    } finally {
      // Always close the browser, even when scraping failed half-way, so that
      // no browser process is left running.
      await browser.close()
    }

    console.log('Scraping is done. Browser is closed. We scraped', allQuestions.length, 'questions')

    const resultJson = {
      groups: groups,
      questions: allQuestions
    }
    const fileUri = './questions.json'
    console.log('Writing questions to file:', fileUri)
    fs.writeFile(fileUri, JSON.stringify(resultJson, null, '\t' /* pretty-print */), (err) => {
      if (err) {
        console.error("Cannot write file questions.json", err)
        process.exitCode = 1
        return // do not report success after a failed write
      }
      console.info("File saved! Goodbye!")
    })
  } catch (error) {
    console.error(error)
    // Let shells and CI notice that the scrape did not succeed.
    process.exitCode = 1
  }
})();
/**
 * Collects the questions shown on the current page.
 *
 * This function is handed to `page.evaluate`, so it executes inside the browser
 * page. It must therefore stay self-contained: it may only use browser globals
 * such as `document`, never helpers defined elsewhere in this file.
 *
 * Each direct child `div` of `div#questions` is treated as one question: its
 * first `div.row` holds the question text and every following `div.row` is one
 * answer. Containers without any `div.row` are skipped.
 *
 * @returns {Array<{order: number, question: {text: string}, answers: Array<{answerText: string, isCorrect: boolean}>}>}
 *   The questions in page order. `order` is the position of the question
 *   container among the children of `div#questions`.
 */
function extractQuestions() {
  console.log('Extracting questions...')
  const questionContainers = Array.from(document.querySelectorAll('div#questions > div'))
  const questions = []

  questionContainers.forEach((container, index) => {
    const [questionRow, ...answerRows] = Array.from(container.querySelectorAll('div.row'))
    if (!questionRow) {
      // Nothing to read from a container without rows; skipping it avoids a
      // TypeError that would abort the whole extraction.
      return
    }

    questions.push({
      order: index,
      question: {
        text: questionRow.innerText.trim()
      },
      answers: answerRows.map((answerRow) => ({
        // Trim first: the "a)" style prefix is anchored to the start of the
        // text, so leading whitespace would otherwise keep it from being removed.
        answerText: answerRow.innerText.trim().replace(/^[a-z]\)/, "").trim(),
        isCorrect: answerRow.className.includes("correct-answer")
      }))
    })
  })

  return questions
}
/**
 * Reads the group menu of the page and returns it as a flat list of groups.
 *
 * A menu item that contains an `i.level` element is treated as a subgroup of
 * the closest preceding item that has no such element; every other item is a
 * top-level group with `parentGroupId` set to `null`.
 *
 * @param {object} page Puppeteer page that already shows the questions site.
 * @returns {Promise<Array<{groupId: number, parentGroupId: (number|null), name: string}>>}
 *   Groups in menu order. `groupId` is the index of the item in the menu.
 */
async function extractGroup(page) {
  const groupLinks = await page.$$('form#setup div.menu div.item')
  console.log("Group links size", groupLinks.length)

  const groups = []
  let currentParentGroupId = null
  // NOTE(review): the menu item at index 0 is deliberately left out; presumably
  // it is not a real group (e.g. an "all questions" entry) - TODO confirm.
  for (let i = 1; i < groupLinks.length; i++) {
    const groupLinkElement = groupLinks[i]
    const isSubGroup = Boolean(await groupLinkElement.$("i.level"))

    // The first <span> holds the name when present; otherwise use the text of
    // the whole item. The span is looked up once and its handle is reused.
    const spanHandle = await groupLinkElement.$("span")
    const nameSource = spanHandle || groupLinkElement
    const rawName = await page.evaluate((node) => node.textContent, nameSource)

    const group = {
      groupId: i,
      parentGroupId: isSubGroup ? currentParentGroupId : null,
      name: rawName.replace(/\n/, '').trim()
    }
    groups.push(group)

    // A top-level group becomes the parent of the subgroups that follow it.
    if (!isSubGroup) {
      currentParentGroupId = group.groupId
    }
  }
  return groups
}
/**
 * Tells whether any group in `groups` names `group` as its parent.
 *
 * @param {{groupId: number}} group Group to check.
 * @param {Array<{parentGroupId: (number|null)}>} groups All known groups.
 * @returns {boolean} `true` when at least one entry has `parentGroupId` equal to `group.groupId`.
 */
function groupHasSubgroups(group, groups) {
  // `some` stops at the first match and does not build an intermediate array.
  return groups.some((candidate) => candidate.parentGroupId === group.groupId)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment