michalbcz · September 7, 2019 09:11
diff --git a/zbranekvalitne-questing-including-groups-scraper.js b/zbranekvalitne-questing-including-groups-scraper.js
 const puppeteer = require('puppeteer')
 const fs = require('fs')

 // This wrapper is an immediately invoked async function: the scrape starts as soon as the script is loaded.
 // Flow: open the page, read the group menu, visit every group that has no subgroups, collect its
 // questions and finally write groups + questions to ./questions.json.
 void(async () => {
    const url = 'https://zbranekvalitne.cz/zbrojni-prukaz/testove-otazky'

    try {
        console.log("I am scraping questions from " + url)
        const browser = await puppeteer.launch({
            // headless: false, // launch headful mode - good for debugging purposes (you will see what happened inside browser)
            // slowMo: 50, // slow down puppeteer script so that it's easier to follow visually
        })

        let groups = []
        const allQuestions = []

        try {
            const page = await browser.newPage()

            // To see console output of the in-page scraping code in the Node.js console, enable:
            // page.on('console', (message) => {
            //     if (message.type() !== 'warning') {
            //         console.log(message.text())
            //     }
            // })

            await page.goto(url)
            console.log("Browser opened, starting to evaluate scraping script...")

            console.log("Scraping groups and its links")
            groups = await extractGroup(page)

            for (const group of groups) {
                if (groupHasSubgroups(group, groups)) {
                    console.log(
                        `
                    Skipping group ${group.groupId} with sub groups. Otherwise we would have duplicated 
                    questions (from parent group and its subgroups), because parent group link leads to all questions for it's subgroups.
                    `
                    )

                    continue // skip this group
                }

                console.log("Extracting questions for group", group)

                // Start waiting for the navigation together with the click that triggers it,
                // so the navigation cannot finish before we begin to wait for it.
                await Promise.all([
                    page.waitForNavigation(),
                    page.evaluate((groupName) => {
                        const groupItems = document.querySelectorAll('form#setup div.menu div.item')
                        const groupItemElement = Array.from(groupItems).find((groupItem) => groupItem.textContent.includes(groupName))

                        // Fail with a readable message instead of "cannot read property 'click' of undefined"
                        if (!groupItemElement) {
                            throw new Error('Menu item for group "' + groupName + '" was not found')
                        }

                        groupItemElement.click()
                    }, group.name)
                ])

                const questions = await page.evaluate(extractQuestions)

                // Tag every question with the group it was scraped from (copy, do not mutate the scraped object)
                for (const question of questions) {
                    allQuestions.push({ ...question, groupId: group.groupId })
                }
            }
        } finally {
            // Close the browser even when scraping failed, otherwise the browser process is left running
            await browser.close()
        }

        console.log('Scraping is done. Browser is closed. We scraped', allQuestions.length, 'questions')

        const resultJson = {
            groups: groups,
            questions: allQuestions
        }

        const fileUri = './questions.json'
        console.log('Writing questions to file:', fileUri)
        fs.writeFile(fileUri, JSON.stringify(resultJson, null, '\t' /* pretty-print */), (err) => {
            if (err) {
                console.error("Cannot write file questions.json", err)
                return // do not report success after a failed write
            }

            console.info("File saved! Goodbye!")
        })

    } catch (error) {
        console.error(error)
    }

 })();

 /**
  * Collects every question found under `div#questions` together with its answers.
  *
  * NOTE: this function is handed to `page.evaluate`, so it runs inside the browser page.
  * It has to stay self-contained: only browser globals such as `document` are available,
  * nothing from the Node.js module scope may be referenced here.
  *
  * In each question container the first `div.row` is the question text and every
  * following `div.row` is one answer; the correct answer carries the `correct-answer` class.
  *
  * @returns {Array<{order: number, question: {text: string}, answers: Array<{answerText: string, isCorrect: boolean}>}>}
  *          `order` is the position of the container among all direct children of `div#questions`.
  */
 function extractQuestions() {
    console.log('Extracting questions...')

    const questionContainers = Array.from(document.querySelectorAll('div#questions > div'))
    const questions = []

    questionContainers.forEach((container, index) => {
        const rows = Array.from(container.querySelectorAll('div.row'))

        // A container without any row cannot be a question; skip it instead of crashing on rows[0]
        if (rows.length === 0) {
            return
        }

        const [questionRow, ...answerRows] = rows

        questions.push({
            order: index,
            question: {
                text: questionRow.innerText.trim()
            },
            answers: answerRows.map((answerRow) => ({
                // Trim first: the "a)" prefix pattern is anchored at the start of the text,
                // so leading whitespace would otherwise keep the prefix in place
                answerText: answerRow.innerText.trim().replace(/^[a-z]\)/, "").trim(),
                isCorrect: answerRow.className.includes("correct-answer")
            }))
        })
    })

    return questions
 }

 /**
  * Reads the group menu (`form#setup div.menu div.item`) and returns it as a flat list.
  *
  * Every returned group has:
  *  - `groupId`: index of the menu item in the menu,
  *  - `parentGroupId`: id of the closest preceding top-level item when the item is a subgroup
  *    (it contains an `i.level` element), otherwise `null`,
  *  - `name`: text of the item's first `span`, or of the whole item when there is no span.
  *
  * @param {object} page puppeteer page with the menu already loaded
  * @returns {Promise<Array<{groupId: number, parentGroupId: (number|null), name: string}>>}
  */
 async function extractGroup(page) {
    const menuItems = await page.$$('form#setup div.menu div.item')

    console.log("Group links size", menuItems.length)

    const readText = (node) => node.textContent
    const collected = []
    let currentParentId = null

    for (const [position, menuItem] of menuItems.entries()) {
        // The very first menu item is never reported as a group
        // (presumably it is an "all groups" entry - TODO confirm against the page)
        if (position === 0) {
            continue
        }

        const subGroupMarker = await menuItem.$("i.level")
        const isTopLevel = !subGroupMarker

        // Prefer the first span as the name source, fall back to the text of the whole item
        const nameSpan = await menuItem.$("span")
        let rawName
        if (nameSpan != null) {
            rawName = await menuItem.$eval("span", readText)
        } else {
            rawName = await page.evaluate(readText, menuItem)
        }

        collected.push({
            groupId: position,
            parentGroupId: isTopLevel ? null : currentParentId,
            name: rawName.replace(/\n/, '').trim()
        })

        // A top-level item becomes the parent of the subgroups that follow it
        if (isTopLevel) {
            currentParentId = position
        }
    }

    return collected
 }

 /**
  * Tells whether any entry of `groups` names `group` as its parent.
  *
  * @param {{groupId: number}} group group to check
  * @param {Array<{parentGroupId: (number|null)}>} groups all known groups
  * @returns {boolean} true when at least one group has `parentGroupId` equal to `group.groupId`
  */
 function groupHasSubgroups(group, groups) {
    return groups.some((candidate) => candidate.parentGroupId === group.groupId)
 }
	const puppeteer = require('puppeteer')
	const fs = require('fs')

	// This wrapper is an immediately invoked async function: the scrape starts as soon as the script is loaded.
	// Flow: open the page, read the group menu, visit every group that has no subgroups, collect its
	// questions and finally write groups + questions to ./questions.json.
	void(async () => {
		const url = 'https://zbranekvalitne.cz/zbrojni-prukaz/testove-otazky'

		try {
			console.log("I am scraping questions from " + url)
			const browser = await puppeteer.launch({
				// headless: false, // launch headful mode - good for debugging purposes (you will see what happened inside browser)
				// slowMo: 50, // slow down puppeteer script so that it's easier to follow visually
			})

			let groups = []
			const allQuestions = []

			try {
				const page = await browser.newPage()

				// To see console output of the in-page scraping code in the Node.js console, enable:
				// page.on('console', (message) => {
				//     if (message.type() !== 'warning') {
				//         console.log(message.text())
				//     }
				// })

				await page.goto(url)
				console.log("Browser opened, starting to evaluate scraping script...")

				console.log("Scraping groups and its links")
				groups = await extractGroup(page)

				for (const group of groups) {
					if (groupHasSubgroups(group, groups)) {
						console.log(
							`
	Skipping group ${group.groupId} with sub groups. Otherwise we would have duplicated
	questions (from parent group and its subgroups), because parent group link leads to all questions for it's subgroups.
	`
						)

						continue // skip this group
					}

					console.log("Extracting questions for group", group)

					// Start waiting for the navigation together with the click that triggers it,
					// so the navigation cannot finish before we begin to wait for it.
					await Promise.all([
						page.waitForNavigation(),
						page.evaluate((groupName) => {
							const groupItems = document.querySelectorAll('form#setup div.menu div.item')
							const groupItemElement = Array.from(groupItems).find((groupItem) => groupItem.textContent.includes(groupName))

							// Fail with a readable message instead of "cannot read property 'click' of undefined"
							if (!groupItemElement) {
								throw new Error('Menu item for group "' + groupName + '" was not found')
							}

							groupItemElement.click()
						}, group.name)
					])

					const questions = await page.evaluate(extractQuestions)

					// Tag every question with the group it was scraped from (copy, do not mutate the scraped object)
					for (const question of questions) {
						allQuestions.push({ ...question, groupId: group.groupId })
					}
				}
			} finally {
				// Close the browser even when scraping failed, otherwise the browser process is left running
				await browser.close()
			}

			console.log('Scraping is done. Browser is closed. We scraped', allQuestions.length, 'questions')

			const resultJson = {
				groups: groups,
				questions: allQuestions
			}

			const fileUri = './questions.json'
			console.log('Writing questions to file:', fileUri)
			fs.writeFile(fileUri, JSON.stringify(resultJson, null, '\t' /* pretty-print */), (err) => {
				if (err) {
					console.error("Cannot write file questions.json", err)
					return // do not report success after a failed write
				}

				console.info("File saved! Goodbye!")
			})

		} catch (error) {
			console.error(error)
		}

	})();

	/**
	 * Collects every question found under `div#questions` together with its answers.
	 *
	 * NOTE: this function is handed to `page.evaluate`, so it runs inside the browser page.
	 * It has to stay self-contained: only browser globals such as `document` are available,
	 * nothing from the Node.js module scope may be referenced here.
	 *
	 * In each question container the first `div.row` is the question text and every
	 * following `div.row` is one answer; the correct answer carries the `correct-answer` class.
	 *
	 * @returns {Array<{order: number, question: {text: string}, answers: Array<{answerText: string, isCorrect: boolean}>}>}
	 *          `order` is the position of the container among all direct children of `div#questions`.
	 */
	function extractQuestions() {
		console.log('Extracting questions...')

		const questionContainers = Array.from(document.querySelectorAll('div#questions > div'))
		const questions = []

		questionContainers.forEach((container, index) => {
			const rows = Array.from(container.querySelectorAll('div.row'))

			// A container without any row cannot be a question; skip it instead of crashing on rows[0]
			if (rows.length === 0) {
				return
			}

			const [questionRow, ...answerRows] = rows

			questions.push({
				order: index,
				question: {
					text: questionRow.innerText.trim()
				},
				answers: answerRows.map((answerRow) => ({
					// Trim first: the "a)" prefix pattern is anchored at the start of the text,
					// so leading whitespace would otherwise keep the prefix in place
					answerText: answerRow.innerText.trim().replace(/^[a-z]\)/, "").trim(),
					isCorrect: answerRow.className.includes("correct-answer")
				}))
			})
		})

		return questions
	}

	/**
	 * Builds the flat list of question groups from the menu `form#setup div.menu div.item`.
	 *
	 * A menu item containing an `i.level` element is treated as a subgroup and gets the id of the
	 * closest preceding top-level item as `parentGroupId`; top-level items get `null`.
	 * `groupId` is the index of the item in the menu and `name` is the text of its first `span`
	 * (or of the whole item when it has no span).
	 *
	 * @param {object} page puppeteer page with the menu already loaded
	 * @returns {Promise<Array<{groupId: number, parentGroupId: (number|null), name: string}>>}
	 */
	async function extractGroup(page) {
		const menuItems = await page.$$('form#setup div.menu div.item')

		console.log("Group links size", menuItems.length)

		const textOf = (node) => node.textContent
		const found = []
		let lastTopLevelId = null

		// Index 0 is left out on purpose, only items from index 1 on are reported
		// (presumably the first item is not a real group - TODO confirm against the page)
		let index = 1
		while (index < menuItems.length) {
			const item = menuItems[index]
			const isSubGroup = Boolean(await item.$("i.level"))

			// The first span holds the name; without a span use the text of the whole item
			const hasSpan = await item.$("span") != null
			const rawName = hasSpan
				? await item.$eval("span", textOf)
				: await page.evaluate(textOf, item)

			found.push({
				groupId: index,
				parentGroupId: isSubGroup ? lastTopLevelId : null,
				name: rawName.replace(/\n/, '').trim()
			})

			// Remember top-level items so that the subgroups after them can point back to them
			if (!isSubGroup) {
				lastTopLevelId = index
			}

			index += 1
		}

		return found
	}

	/**
	 * Checks whether `group` is referenced as parent by at least one entry of `groups`.
	 *
	 * @param {{groupId: number}} group group to check
	 * @param {Array<{parentGroupId: (number|null)}>} groups all known groups
	 * @returns {boolean} true when some group has `parentGroupId` equal to `group.groupId`
	 */
	function groupHasSubgroups(group, groups) {
		const isChildOfGroup = (other) => other.parentGroupId === group.groupId

		return groups.some(isChildOfGroup)
	}