-
-
Save jalbertbowden/19a85282480803d7dfa9e7891585bc05 to your computer and use it in GitHub Desktop.
tableau-covid-scraping
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Works with Node v12.0 and puppeteer 4.0

// Embedded Tableau Public view to load; the script below watches this page's
// network traffic for the request whose URL contains "bootstrapSession".
// NOTE: inside this file the constant shadows Node's global `URL` class.
const URL = 'https://public.tableau.com/views/PPV_15924847800480/ppv_db?%3Aembed=y&%3AshowVizHome=no&%3Adisplay_count=y&%3Adisplay_static_image=n&%3AbootstrapWhenNotified=true&%3Alanguage=en&:embed=y&:showVizHome=n&:apiID=host0';
const puppeteer = require('puppeteer');
/**
 * Builds one CSV table (a header line followed by data rows) from a single pane.
 *
 * The header is `<label>,Date`. The label is looked up in `dictionary` using the
 * first alias index of the label column as-is. Each data row holds two cells:
 * the first unquoted, the second wrapped in double quotes. Row cell alias
 * indices are negated and offset by one (`-index - 1`) before being used as
 * positions in `dictionary`.
 *
 * @param {Array} dictionary - Value dictionary that alias indices point into.
 * @param {Array<{aliasIndices: number[]}>} columns - The pane's `vizPaneColumns`.
 * @param {{labelColumn: number, countColumn: number, firstColumn: number, secondColumn: number}} layout
 *   Positions within `columns`: where the header label lives, which column's
 *   `aliasIndices.length` decides the number of rows, and which columns feed
 *   the first (unquoted) and second (quoted) cell of each row.
 * @returns {string} The CSV text, every line terminated by "\n".
 */
function buildPaneCsv(dictionary, columns, { labelColumn, countColumn, firstColumn, secondColumn }) {
  const rowCount = columns[countColumn].aliasIndices.length;
  const label = dictionary[columns[labelColumn].aliasIndices[0]];
  const firstIndices = columns[firstColumn].aliasIndices;
  const secondIndices = columns[secondColumn].aliasIndices;

  let csv = `${label},Date\n`;
  for (let i = 0; i < rowCount; i++) {
    const firstCell = dictionary[-1 * firstIndices[i] - 1];
    const secondCell = dictionary[-1 * secondIndices[i] - 1];
    csv += `${firstCell},"${secondCell}"\n`;
  }
  return csv;
}

/**
 * Prints two CSV tables extracted from a parsed Tableau bootstrapSession payload.
 *
 * Reads the value dictionary (`dataColumns[1].dataValues` of data segment "0")
 * and the `PPV` pane column list from `jsonParsed[1].secondaryInfo.presModelMap`,
 * then logs one CSV string per pane with `console.log`: pane 1 first, pane 0 second.
 *
 * @param {Array<object>} jsonParsed - Parsed response; only element 1 is read.
 * @returns {void}
 * @throws {TypeError} If any expected level of the payload structure is missing.
 */
function parseDataDictionary(jsonParsed) {
  const { presModelMap } = jsonParsed[1].secondaryInfo;

  const dictionary = presModelMap.dataDictionary.presModelHolder
    .genDataDictionaryPresModel.dataSegments['0'].dataColumns[1].dataValues;

  const { paneColumnsList } = presModelMap.vizData.presModelHolder
    .genPresModelMapPresModel.presModelMap.PPV.presModelHolder.genVizDataPresModel
    .paneColumnsData;

  // Tests Per 10K Residents
  console.log(buildPaneCsv(dictionary, paneColumnsList[1].vizPaneColumns, {
    labelColumn: 3,
    countColumn: 2,
    firstColumn: 1,
    secondColumn: 2,
  }));

  // Percent Positive
  // NOTE(review): the row count here comes from the label column (3), whereas the
  // pane above uses column 2; kept exactly as the original — confirm it is intended.
  console.log(buildPaneCsv(dictionary, paneColumnsList[0].vizPaneColumns, {
    labelColumn: 3,
    countColumn: 3,
    firstColumn: 2,
    secondColumn: 1,
  }));
}
// Below, largely cribbed from Thomas Dondorf at https://stackoverflow.com/questions/52969381/how-can-i-capture-all-network-requests-and-full-response-data-when-loading-a-pag
//
// Entry point: launches a browser, loads URL with request interception enabled,
// lets requests through one at a time, and hands the response of the request
// whose URL contains "bootstrapSession" to parseDataDictionary().
(async () => {
  const browser = await puppeteer.launch();
  try {
    const [page] = await browser.pages();

    // Only one request is allowed in flight at a time; the rest wait here as
    // callbacks that resume them.
    let paused = false;
    const pausedRequests = [];

    // Continue the next queued request, or "unpause" when nothing is waiting.
    const nextRequest = () => {
      if (pausedRequests.length === 0) {
        paused = false;
      } else {
        // Continue the first request in the queue.
        (pausedRequests.shift())(); // calls the request.continue function
      }
    };

    await page.setRequestInterception(true);

    page.on('request', (request) => {
      if (paused) {
        pausedRequests.push(() => request.continue());
      } else {
        paused = true; // pause, as we are processing a request now
        request.continue();
      }
    });

    page.on('requestfinished', async (request) => {
      try {
        if (request.url().includes('bootstrapSession')) {
          const response = request.response();
          let responseBody = (await response.buffer()).toString();
          // The body is a run of `<digits>;{...}` chunks. Drop the numeric
          // prefixes and join the chunks with commas so the whole body parses
          // as a single JSON array.
          responseBody = responseBody.replace(/^\d+;{/g, '{');
          responseBody = responseBody.replace(/\d+;{/g, ',{');
          responseBody = '[' + responseBody + ']';
          parseDataDictionary(JSON.parse(responseBody));
        }
      } catch (err) {
        // Report the failure but keep the request queue moving.
        console.error('Failed to process bootstrapSession response:', err);
        process.exitCode = 1;
      } finally {
        // Always release the next request; otherwise one bad response would
        // leave every queued request paused.
        nextRequest();
      }
    });

    page.on('requestfailed', () => {
      // A failed request also frees the slot for the next one.
      nextRequest();
    });

    await page.goto(URL, { waitUntil: 'networkidle0' });
  } finally {
    // Close the browser even when setup or navigation throws.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment