Last active
May 7, 2025 02:08
-
-
Save railson-ferreira/69850dfafdc638446c598c4d91b12edc to your computer and use it in GitHub Desktop.
dominiopublico.gov.br research publications randomizer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Picks a random publication from dominiopublico.gov.br and logs its PDF download link.
 *
 * Flow:
 *  1. Fetch the search form and read the `co_categoria` <select> to map category names to site ids.
 *  2. Verify that every category returned by getCategories() was found in that <select>.
 *  3. Draw a random category, run the search and read the "Itens encontrados" counter.
 *  4. Draw a random item position, re-run the search skipping to it and extract its detail link.
 *  5. Fetch the detail page and log the download URL.
 *
 * @returns {Promise<void>} Resolves after logging; returns early when the search reports zero items.
 * @throws {Error} When the category lists are empty or do not match, or when an expected
 *   fragment cannot be found in a fetched page. Errors from fetchBody propagate unchanged.
 */
async function main() {
  const urlPesquisa = "http://www.dominiopublico.gov.br/pesquisa/PesquisaObraForm.do";
  const urlResults = "http://www.dominiopublico.gov.br/pesquisa/ResultadoPesquisaObraForm.do";

  const homeBody = await fetchBody(urlPesquisa + "?co_midia=2");
  const regexSelect = /<select name="co_categoria"[^>]*>(.*?)<\/select>/s;
  const regexValue = /value="(\d*)"/s;
  const categories = getCategories();
  const knownCategoryNames = new Set(categories.values());
  const categoriesToIdMap = new Map();

  // Each chunk is one "<option value=…>Name"; stripping the tag leaves the visible name.
  const optionsHtml = matchOrThrow(regexSelect, homeBody, "the co_categoria <select>")[1];
  for (const optionHtml of optionsHtml.split("</option>")) {
    const fetchedCategoryId = regexValue.exec(optionHtml)?.[1];
    const fetchedCategoryName = optionHtml.replace(/<.*>/, "").trim();
    if (knownCategoryNames.has(fetchedCategoryName)) {
      categoriesToIdMap.set(fetchedCategoryName, fetchedCategoryId);
    }
  }

  if (!categories.size || !categoriesToIdMap.size) {
    throw new Error(`empty categories! list A = ${categories.size}; list B = ${categoriesToIdMap.size}`);
  }
  if (categories.size !== categoriesToIdMap.size) {
    throw new Error(`categories size does not match! list A = ${categories.size}; list B = ${categoriesToIdMap.size}`);
  }

  const minCategoryNumber = 1;
  const sortedCategoryNumber = getRandomIntBetween(minCategoryNumber, categories.size);
  const categoryName = categories.get(sortedCategoryNumber);
  const categoryId = categoriesToIdMap.get(categoryName);
  console.log(`Sorted number between ${minCategoryNumber} and ${categories.size} = ${sortedCategoryNumber}`);
  console.log(`Category Name: ${categoryName}`);
  console.log(`Category Id: ${categoryId}`);

  // Same query string for both searches; only the `skip` offset differs.
  const buildSearchUrl = (skip) =>
    `${urlResults}?first=50&skip=${skip}&ds_titulo=&co_autor=&no_autor=&co_categoria=${categoryId}&pagina=1&select_action=Submit&co_midia=2&co_obra=&co_idioma=1&colunaOrdenar=null&ordem=null`;

  const searchResultBody = await fetchBody(buildSearchUrl(0));
  // The counter may carry thousands separators ("1.234" / "1,234"); accept any number of groups.
  const foundItemsRegex = /(\d+(?:[,.]\d+)*) Itens encontrados/is;
  const foundItemsMatch = matchOrThrow(foundItemsRegex, searchResultBody, "the \"Itens encontrados\" counter");
  console.log("text found = ", `"${foundItemsMatch[0]}"`);
  const foundItems = Number(foundItemsMatch[1].replace(/[,.]/g, ""));
  if (foundItems === 0) {
    console.log(`No items found in this search: ${foundItems}`);
    return;
  }
  console.log(`Items found = ${foundItems}`);

  const minItemNumber = 1;
  const sortedItemNumber = getRandomIntBetween(minItemNumber, foundItems);
  console.log(`Sorted number between ${minItemNumber} and ${foundItems} = ${sortedItemNumber}`);

  // Skip straight to the drawn item so that it is part of the returned page.
  const searchResultBodyDirectToTheItem = await fetchBody(buildSearchUrl(sortedItemNumber - 1));
  // NOTE(review): the leading `.*` is greedy and the item number is not anchored, so this can
  // match a later row whose number merely ends with the same digits. Confirm against the real
  // page markup before tightening the pattern.
  const itemLineRegex = new RegExp(`<tr.*(${sortedItemNumber}\\s*[.].*?'([.][.]/pesquisa/DetalheObraForm.do[^']*).*?<a[^>]*>(.*?)<\\/a>.*?)<\\/tr>`, "s");
  const itemLineRegexResult = matchOrThrow(
    itemLineRegex,
    searchResultBodyDirectToTheItem,
    `the result row for item ${sortedItemNumber}`,
  );
  const itemRelativeUrl = itemLineRegexResult[2];
  const itemTitle = itemLineRegexResult[3].trim();
  console.log(`Title: ${itemTitle}`);

  // Only the query string of the relative link is reused; the path is rebuilt as an absolute URL.
  const itemAbsoluteUrl = "http://www.dominiopublico.gov.br/pesquisa/DetalheObraForm.do?" + itemRelativeUrl.split("?")[1];
  const itemPageBody = await fetchBody(itemAbsoluteUrl);
  const downloadUrlRegex = /"(DetalheObraDownload.do[^"]*)"/s;
  const downloadPath = matchOrThrow(downloadUrlRegex, itemPageBody, "the download link")[1];
  const downloadUrl = "http://www.dominiopublico.gov.br/pesquisa/" + downloadPath;
  console.log(`Link to download PDF = \n${downloadUrl}`);
}

/**
 * Runs `regex` against `text` and returns the match array, throwing a descriptive
 * Error (instead of letting a later `null[...]` access fail) when nothing matches.
 *
 * @param {RegExp} regex - Pattern to execute.
 * @param {string} text - Text to search.
 * @param {string} description - Human-readable name of what was being looked for.
 * @returns {RegExpExecArray} The successful match.
 * @throws {Error} When the pattern does not match.
 */
function matchOrThrow(regex, text, description) {
  const match = regex.exec(text);
  if (match === null) {
    throw new Error(`Could not find ${description} in the fetched page`);
  }
  return match;
}
/**
 * Fetches `url` and returns the response body decoded as text.
 *
 * The body is decoded with the charset declared in the `Content-Type` header
 * (case-insensitive, optionally quoted, trailing parameters ignored); UTF-8 is
 * used when no charset is declared.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<string>} The decoded response body.
 * @throws {Error} When the response status is not 200; the message is `[status] body`.
 * @throws {RangeError} When the declared charset is not a label TextDecoder supports.
 */
async function fetchBody(url) {
  console.log(`🔻 fetching ${url}`);
  const response = await fetch(url);
  if (response.status !== 200) {
    throw new Error(`[${response.status}] ${await response.text()}`);
  }
  const buffer = await response.arrayBuffer();
  const contentType = response.headers.get("content-type") ?? "";
  // Take only the charset token itself: stop at quotes, ';' or whitespace so values such as
  // `charset="utf-8"; foo=bar` do not leak extra characters into the TextDecoder label.
  const charsetMatch = /charset\s*=\s*["']?([^"';\s]+)/i.exec(contentType);
  const encoding = charsetMatch?.[1] ?? "utf-8";
  return new TextDecoder(encoding).decode(buffer);
}
/**
 * Returns the hard-coded list of categories as a Map of number -> category name.
 *
 * Each non-empty line of the list has the form `<number>. <name>`. main() draws a
 * random key between 1 and the Map's size and matches the names against the site's
 * option texts, so numbering should stay contiguous and names must be kept verbatim.
 *
 * @returns {Map<number, string>} Category number mapped to its trimmed name.
 * @throws {Error} When the text before the first dot of a line is not a positive integer.
 */
function getCategories() {
  const categoriesStr = `
    1. Administração
    2. Agronomia
    3. Artes
    4. Biologia Geral
    5. Ciência Política
    6. Ciência da Computação
    7. Ciência da Informação
    8. Ciências da Saúde
    9. Coleção Educadores
    10. Comunicação
    11. Defesa civil
    12. Direito
    13. Direitos humanos
    14. Economia
    15. Educação
    16. Educação Física
    17. Engenharias
    18. Filosofia
    19. Física
    20. Geografia
    21. História
    22. Literatura
    23. Literatura Infantil
    24. Literatura de Cordel
    25. Línguas
    26. Medicina
    27. Medicina Veterinária
    28. Meio Ambiente
    29. Meteorologia
    30. Multidisciplinar
    31. Música
    32. Psicologia
    33. Química
    34. Relações Internacionais
    35. Saúde Coletiva
    36. Serviço Social
    37. Sociologia
    38. Teologia
    39. Turismo
    40. Teses e Dissertações
    41. Trabalho
  `;
  const entries = categoriesStr
    .split("\n")
    .filter((line) => line.includes("."))
    .map((line) => {
      // Split on the FIRST dot only, so a name that itself contains a dot is kept whole.
      const separatorIndex = line.indexOf(".");
      const rawNumber = line.slice(0, separatorIndex).trim();
      const category = line.slice(separatorIndex + 1).trim();
      const number = Number(rawNumber);
      if (!Number.isInteger(number) || number < 1) {
        throw new Error(`Not a number in category '${category}': ${rawNumber} -> ${number}`);
      }
      return [number, category];
    });
  return new Map(entries);
}
/**
 * Returns a pseudo-random integer in the inclusive range [min, max].
 *
 * Uses Math.random(), so the result is not suitable for security-sensitive purposes.
 *
 * @param {number} min - Lower bound (inclusive).
 * @param {number} max - Upper bound (inclusive).
 * @returns {number} A value `v` with `min <= v <= max` (for integer bounds).
 * @throws {RangeError} When `max` is less than `min`.
 */
function getRandomIntBetween(min, max) {
  if (max < min) {
    // Report the operands in the order that matches the failed condition.
    throw new RangeError(`invalid! max cannot be less than min: ${max} < ${min}`);
  }
  return min + Math.floor(Math.random() * (1 + max - min));
}
// Script entry point: run main() and report any failure instead of leaving the promise floating.
main().catch((error) => {
  console.error(error);
  // Keep a failing exit status under Node; `process` does not exist in browsers.
  if (typeof process !== "undefined") {
    process.exitCode = 1;
  }
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment