Last active
August 29, 2015 14:16
-
-
Save jonatasleon/2b1007b1816c3a3c2353 to your computer and use it in GitHub Desktop.
Recolhe informações diretamente do site do IBGE. Para acessar alguma cidade, o arquivo cities.txt deve conter o estado e a cidade na seguinte forma: sao-paulo|guaratingueta ou acre|acrelandia, onde cada cidade deve estar em uma linha. Adaptado a partir de https://unknownsec.wordpress.com/2014/10/09/coleta-de-dados-do-ibge-python-beatifulsoap-e-urll…
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- encoding:utf-8 -*- | |
import urllib2 | |
from collections import Iterable | |
from bs4 import BeautifulSoup | |
# Entry points on the IBGE "Cidades" site.  URL_UF and URL_MUN currently
# share the same base path; they are kept as separate names so the state
# and municipality URLs can diverge independently if the site changes.
URL_HOME = 'http://cidades.ibge.gov.br/xtras/home.php'
URL_UF = 'http://cidades.ibge.gov.br/xtras/'
URL_MUN = 'http://cidades.ibge.gov.br/xtras/'
# Input file: one "estado|cidade" entry per line (see getCities).
FILE_CITIES = 'cities.txt'
def getCities(filename=None):
    """Read the list of ``estado|cidade`` entries to search for.

    Each non-blank line of the file holds one entry in the form
    ``estado|cidade`` (e.g. ``sao-paulo|guaratingueta``).

    :param filename: path of the cities file; defaults to ``FILE_CITIES``.
        The file is created empty if it does not exist yet (this preserves
        the original ``'a+'`` behaviour).
    :return: list of the stripped, non-empty lines.
    """
    if filename is None:
        filename = FILE_CITIES
    # BUG FIX: 'a+' creates the file when missing but positions the read
    # cursor at EOF, so the original loop never read a single line and the
    # function always returned [].  seek(0) restores the intended read.
    with open(filename, 'a+') as f:
        f.seek(0)
        # Skip blank lines: the original appended '' for them, which
        # searchData then had to filter out again.
        return [line.strip() for line in f if line.strip()]
def searchData(cities):
    """Scrape the IBGE "Cidades" site for the municipalities in *cities*.

    :param cities: iterable of ``estado|cidade`` fragments; a city is
        selected when its fragment appears in the municipality URL.
    :return: list of tuples
        ``(cod, pop2014, pop2010, prefeito, mun, area, dens)``, or the
        single-element list ``['Nenhuma cidade buscada']`` when *cities*
        is empty/None (original sentinel, kept for compatibility).
    """
    if not cities:
        return ['Nenhuma cidade buscada']
    result = []
    home = BeautifulSoup(urllib2.urlopen(URL_HOME).read())
    for link in home.find_all('a'):
        pagina = link.get('href')
        # href may be None on anchor-less <a> tags; the original used an
        # isinstance(..., Iterable) check for the same purpose.
        if not pagina or "uf.php" not in pagina:
            continue
        urlUf = URL_UF + pagina.split('xtras/')[1]
        # Use a distinct name: the original clobbered the outer `soup`.
        soupUf = BeautifulSoup(urllib2.urlopen(urlUf).read())
        for linkMun in soupUf.find_all('a'):
            paginaMun = linkMun.get('href')
            if (not paginaMun or "perfil.php" not in paginaMun
                    or "/estadosat/" in paginaMun):
                continue
            for city in cities:
                if city == "" or city not in paginaMun:
                    continue
                # Narrowed try: the original wrapped everything in a bare
                # `except: pass`, silently hiding every failure.
                try:
                    soupMun = BeautifulSoup(
                        urllib2.urlopen(URL_MUN + paginaMun).read())
                    # Fetch the 'valor' cells once instead of five times.
                    valores = soupMun.find_all('td', {'class': 'valor'})
                    mun = soupMun.find('title').get_text().split("|")[3].strip()
                    result.append((
                        valores[4].get_text(),   # codigo do municipio
                        valores[0].get_text(),   # populacao 2014
                        valores[1].get_text(),   # populacao 2010
                        valores[6].get_text(),   # prefeito
                        mun,                     # nome do municipio
                        valores[2].get_text(),   # area (km2)
                        valores[3].get_text(),   # densidade (hab/km2)
                    ))
                except (urllib2.URLError, IndexError, AttributeError):
                    # Request failed or the page layout changed for this
                    # city: skip it rather than aborting the whole run.
                    continue
    return result
if __name__ == '__main__':
    # Guarded entry point: importing this module no longer triggers the
    # network scrape as a side effect.
    for city in searchData(getCities()):
        # Single pre-formatted strings reproduce the exact output of the
        # original `print "label: ", value` statements (Python 2's comma
        # inserted one extra space) and also work under Python 3.
        print("Nome do município:  %s" % city[4])
        print("Código:  %s" % city[0])
        print("População 2014:  %s" % city[1])
        print("População 2010:  %s" % city[2])
        print("Área da unidade territorial(km²):  %s" % city[5])
        print("Densidade demográfica(hab/km²):  %s" % city[6])
        print("Prefeito :  %s" % city[3])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment