Skip to content

Instantly share code, notes, and snippets.

@railson-ferreira
Last active May 7, 2025 02:08
Show Gist options
  • Select an option

  • Save railson-ferreira/69850dfafdc638446c598c4d91b12edc to your computer and use it in GitHub Desktop.

Select an option

Save railson-ferreira/69850dfafdc638446c598c4d91b12edc to your computer and use it in GitHub Desktop.
dominiopublico.gov.br research publications randomizer
/**
 * Picks a random publication from dominiopublico.gov.br and prints its
 * title and PDF download link.
 *
 * Steps:
 *  1. Fetch the search form and read the `co_categoria` <select> to learn the
 *     site's id for every category name returned by `getCategories()`.
 *  2. Draw a random category, run a search for it and read the total number
 *     of items found.
 *  3. Draw a random item number, re-run the search skipping directly to it,
 *     and extract the item's detail URL and title from the result row.
 *  4. Fetch the detail page and print the download URL found there.
 *
 * Relies on `fetchBody`, `getCategories` and `getRandomIntBetween` from this
 * file. All progress is reported through `console.log`.
 *
 * @returns {Promise<void>}
 * @throws {Error} when the category lists are empty or do not line up, or
 *   when an expected fragment cannot be found in a fetched page.
 */
async function main() {
  const urlPesquisa = "http://www.dominiopublico.gov.br/pesquisa/PesquisaObraForm.do";
  const homeBody = await fetchBody(`${urlPesquisa}?co_midia=2`);

  // Each "<option value="ID">Name" fragment becomes an [id, name] pair.
  const regexSelect = /<select name="co_categoria"[^>]*>(.*?)<\/select>/s;
  const regexValue = /value="(\d*)"/s;
  const selectInnerHtml = execOrThrow(regexSelect, homeBody, "the co_categoria <select>")[1];
  const fetchedCategories = selectInnerHtml.split("</option>").map((optionHtml) => {
    const id = regexValue.exec(optionHtml)?.[1];
    return [id, optionHtml.replace(/<.*>/, "").trim()];
  });

  // Keep only the fetched categories whose name is in the local numbered list.
  const categories = getCategories();
  const knownCategoryNames = new Set(categories.values());
  const categoriesToIdMap = new Map();
  for (const [fetchedId, fetchedName] of fetchedCategories) {
    if (knownCategoryNames.has(fetchedName)) {
      categoriesToIdMap.set(fetchedName, fetchedId);
    }
  }

  if (!categories.size || !categoriesToIdMap.size) {
    throw new Error(`empty categories! list A = ${categories.size}; list B = ${categoriesToIdMap.size}`);
  }
  if (categories.size !== categoriesToIdMap.size) {
    throw new Error(`categories size does not match! list A = ${categories.size}; list B = ${categoriesToIdMap.size}`);
  }

  const minCategoryNumber = 1;
  const sortedCategoryNumber = getRandomIntBetween(minCategoryNumber, categories.size);
  const categoryName = categories.get(sortedCategoryNumber);
  const categoryId = categoriesToIdMap.get(categoryName);
  console.log(`Sorted number between ${minCategoryNumber} and ${categories.size} = ${sortedCategoryNumber}`);
  console.log(`Category Name: ${categoryName}`);
  console.log(`Category Id: ${categoryId}`);

  const urlResults = "http://www.dominiopublico.gov.br/pesquisa/ResultadoPesquisaObraForm.do";
  const searchQueryParams = `?first=50&skip=0&ds_titulo=&co_autor=&no_autor=&co_categoria=${categoryId}&pagina=1&select_action=Submit&co_midia=2&co_obra=&co_idioma=1&colunaOrdenar=null&ordem=null`;
  const searchResultBody = await fetchBody(urlResults + searchQueryParams);

  // The count may carry any number of thousands separators ("1.234.567"),
  // so accept repeated groups and strip every separator before parsing.
  const foundItemsRegex = /(\d+(?:[,.]\d+)*) Itens encontrados/is;
  const foundItemsMatch = execOrThrow(foundItemsRegex, searchResultBody, "the 'Itens encontrados' counter");
  console.log("text found = ", `"${foundItemsMatch[0]}"`);
  const foundItems = Number(foundItemsMatch[1].replace(/[,.]/g, ""));
  if (!foundItems) {
    console.log(`No items found in this search: ${foundItems}`);
    return;
  }
  console.log(`Items found = ${foundItems}`);

  const minItemNumber = 1;
  const sortedItemNumber = getRandomIntBetween(minItemNumber, foundItems);
  console.log(`Sorted number between ${minItemNumber} and ${foundItems} = ${sortedItemNumber}`);

  const searchResultBodyDirectToTheItem = await fetchBody(
    urlResults + searchQueryParams.replace("skip=0", `skip=${sortedItemNumber - 1}`),
  );
  // The (?<!\d) lookbehind stops the item number from matching the tail of a
  // larger number (e.g. "5." inside "15."), which would select the wrong row.
  const itemLineRegex = new RegExp(
    `<tr.*((?<!\\d)${sortedItemNumber}\\s*[.].*?'([.][.]/pesquisa/DetalheObraForm.do[^']*).*?<a[^>]*>(.*?)<\\/a>.*?)<\\/tr>`,
    "s",
  );
  const itemLineMatch = execOrThrow(
    itemLineRegex,
    searchResultBodyDirectToTheItem,
    `the result row for item ${sortedItemNumber}`,
  );
  const itemRelativeUrl = itemLineMatch[2];
  const itemTitle = itemLineMatch[3].trim();
  console.log(`Title: ${itemTitle}`);

  // Only the query string of the relative link is reused on the absolute URL.
  const itemAbsoluteUrl = `http://www.dominiopublico.gov.br/pesquisa/DetalheObraForm.do?${itemRelativeUrl.split("?")[1]}`;
  const itemPageBody = await fetchBody(itemAbsoluteUrl);
  const downloadUrlRegex = /"(DetalheObraDownload.do[^"]*)"/s;
  const downloadMatch = execOrThrow(downloadUrlRegex, itemPageBody, "the download link");
  const downloadUrl = `http://www.dominiopublico.gov.br/pesquisa/${downloadMatch[1]}`;
  console.log(`Link to download PDF = \n${downloadUrl}`);
}

/**
 * Runs `regex.exec(text)` and returns the match, throwing a descriptive
 * error instead of letting a later `null[...]` access fail obscurely.
 *
 * @param {RegExp} regex - non-global pattern to execute.
 * @param {string} text - text to search.
 * @param {string} description - what was being looked for, used in the error.
 * @returns {RegExpExecArray} the successful match.
 * @throws {Error} when the pattern does not match.
 */
function execOrThrow(regex, text, description) {
  const match = regex.exec(text);
  if (match === null) {
    throw new Error(`could not find ${description} in the fetched page`);
  }
  return match;
}
/**
 * Downloads `url` and returns the response body decoded as text.
 *
 * The body is decoded with the charset announced in the `Content-Type`
 * header; when the header is missing or carries no charset, UTF-8 is used.
 *
 * @param {string} url - address to fetch.
 * @returns {Promise<string>} the decoded response body.
 * @throws {Error} when the response status is not 200; the message holds the
 *   status code and the response text.
 * @throws {RangeError} when the announced charset is not a label that
 *   `TextDecoder` supports.
 */
async function fetchBody(url) {
  console.log(`🔻 fetching ${url}`);
  const response = await fetch(url);
  if (response.status !== 200) {
    throw new Error(`[${response.status}] ${await response.text()}`);
  }
  const buffer = await response.arrayBuffer();
  const contentType = response.headers.get("content-type") ?? "";
  // The charset parameter name is case-insensitive, its value may be quoted,
  // and other parameters may follow it, so capture only the bare label.
  const charsetMatch = /charset\s*=\s*"?([^";\s]+)/i.exec(contentType);
  const encoding = charsetMatch?.[1] ?? "utf-8";
  return new TextDecoder(encoding).decode(buffer);
}
/**
 * Builds the numbered list of publication categories.
 *
 * The list is kept as plain "N. Name" text; every line containing a dot is
 * split on "." into its number and its name.
 *
 * @returns {Map<number, string>} category number mapped to category name.
 * @throws a string message when the text before the dot is not a non-zero
 *   number.
 */
function getCategories() {
  const rawList = `
1. Administração
2. Agronomia
3. Artes
4. Biologia Geral
5. Ciência Política
6. Ciência da Computação
7. Ciência da Informação
8. Ciências da Saúde
9. Coleção Educadores
10. Comunicação
11. Defesa civil
12. Direito
13. Direitos humanos
14. Economia
15. Educação
16. Educação Física
17. Engenharias
18. Filosofia
19. Física
20. Geografia
21. História
22. Literatura
23. Literatura Infantil
24. Literatura de Cordel
25. Línguas
26. Medicina
27. Medicina Veterinária
28. Meio Ambiente
29. Meteorologia
30. Multidisciplinar
31. Música
32. Psicologia
33. Química
34. Relações Internacionais
35. Saúde Coletiva
36. Serviço Social
37. Sociologia
38. Teologia
39. Turismo
40. Teses e Dissertações
41. Trabalho
`
  const numbered = new Map();
  for (const line of rawList.split("\n")) {
    // Blank lines (and anything without a dot) carry no entry.
    if (!line.includes(".")) {
      continue;
    }
    const pieces = line.split(".");
    const index = Number(pieces[0].trim());
    const label = pieces[1].trim();
    if (!index) throw `Not a number in category '${label}': ${pieces[0]} -> ${index}`
    numbered.set(index, label);
  }
  return numbered;
}
/**
 * Returns a pseudo-random integer in the inclusive range [min, max].
 *
 * Uses `Math.random()`, so the result is not suitable for security purposes.
 *
 * @param {number} min - lower bound (inclusive).
 * @param {number} max - upper bound (inclusive).
 * @returns {number} `min` plus a whole offset between 0 and `max - min`.
 * @throws {RangeError} when `max` is lower than `min`.
 */
function getRandomIntBetween(min, max) {
  if (max < min) {
    throw new RangeError(`invalid! max cannot be lesser than min: max = ${max} < min = ${min}`);
  }
  // "1 +" makes the upper bound inclusive.
  return min + Math.floor(Math.random() * (1 + max - min));
}
// Run the script. The rejection handler keeps the promise from floating:
// it reports the failure and, where a `process` global exists (Node.js),
// marks the run as failed through the exit code.
main().catch((error) => {
  console.error(error);
  if (typeof process !== "undefined") {
    process.exitCode = 1;
  }
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment