Code used to convert WEF (World Economic Forum) report files to CSV.
scrape_folders.php:
<?php
/*
################################################################################################
# Author: mekler
#
# Installation instructions
# sudo apt-get install php5-cli
#
# Command line
# python wef-hack_2008.py 5626281-Financial-Development-Report-2008/ $(ls -1 5626281-Financial-Development-Report-2008/ | grep .pdf$)
# php scrape_folders.php 5626281-Financial-Development-Report-2008/ $(ls -1 5626281-Financial-Development-Report-2008/ | grep ^pg.*\.csv$)
#
################################################################################################
*/
$path = $argv[1];
$n = count($argv);
for ($i = 2; $i < $n; $i++) {
    $fileDesc = pathinfo($argv[$i]);
    if (isset($fileDesc['extension']) && $fileDesc['extension'] == 'csv') {
        // Insert "|" delimiters after digits, after "n/a" and in place of dotted leaders,
        // writing the result to salida_<file> in the same folder.
        $cmd = "cat ".$path.$argv[$i]." | sed 's/[0-9] /&|/g' | sed 's/n\/a /&|/g' | sed 's/\.\{2,\}/|/g' > ".$path."salida_".$fileDesc['basename'];
        echo $cmd."\n";
        exec($cmd);
    }
}
?>
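The sed pipeline above only inserts "|" delimiters: after a digit followed by a space, after "n/a ", and in place of runs of two or more dots (the dotted leaders in the WEF tables). Below is a minimal Python sketch of the same substitutions; the function name and the sample line are made up for illustration and are not taken from an actual page file.

import re

def agrega_delimitadores(linea):
    # Mirror the three sed substitutions from scrape_folders.php.
    linea = re.sub(r'([0-9] )', r'\1|', linea)   # "|" after digit + space
    linea = re.sub(r'(n/a )', r'\1|', linea)     # "|" after "n/a "
    linea = re.sub(r'\.{2,}', '|', linea)        # dotted leaders -> "|"
    return linea

# Assumed example of a line in one of the pg*.csv pages:
print agrega_delimitadores("12 Canada ........... 5.37")
# prints: 12 |Canada | 5.37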
wef-hack_2008.py:
#!/usr/bin/env python
# coding=utf-8
################################################################################################
# Author: mekler
#
# Installation instructions
# sudo apt-get install python-pip
# sudo pip install scraperwiki
#
# Command line
# python wef-hack_2008.py 5626281-Financial-Development-Report-2008/ $(ls -1 5626281-Financial-Development-Report-2008/ | grep .pdf$)
# php scrape_folders.php 5626281-Financial-Development-Report-2008/ $(ls -1 5626281-Financial-Development-Report-2008/ | grep ^pg.*\.csv$)
################################################################################################
import sys, re
import csv
import scraperwiki, urllib
import lxml.etree

argumentos = len(sys.argv)

def guarda_archivo(archivo_salida, izquierdo, derecho):
    # Write the left and right columns side by side to a "|"-delimited CSV,
    # padding the shorter column with a placeholder row.
    f = open(archivo_salida, "wb")  # open file for writing
    w = csv.writer(f, delimiter='|',
                   quotechar='"', quoting=csv.QUOTE_MINIMAL)  # open a csv writer
    n = len(izquierdo)
    if len(derecho) > 0:
        if len(derecho) > len(izquierdo):
            for i in range(0, len(derecho)):
                if i < len(izquierdo):
                    texto = izquierdo[i]
                else:
                    texto = "1 No Encontrado ........... 1"
                w.writerow([texto, derecho[i]])  # write each row
        else:
            for i in range(0, n):
                if i < len(derecho):
                    texto = derecho[i]
                else:
                    texto = "1 No Encontrado ........... 1"
                w.writerow([izquierdo[i], texto])  # write each row
    else:
        for i in range(0, n):
            w.writerow([izquierdo[i]])  # write each row
    f.close()

def busca_datos(texto, i, elementos):
    # If the current text node is a rank ("1".."99" or "n/a") followed by a country name,
    # scan forward to the next dotted-leader node and glue the two together.
    m = re.match(r'([\d]{1,2}|n\/a)\s+([a-zA-ZáÁÉéíÍóÓúÚñÑ\s]+|Korea, Rep. ?)$', texto)
    if m is not None:
        print m.group(0)
        flag = True
        while flag:
            i = i + 1
            k = re.match(r'[\.]+', elementos[i].text.strip().encode('utf-8'))
            if k is not None:
                flag = False
                return [True, i, m.group(0) + " " + elementos[i].text.strip().encode('utf-8')]
    else:
        return [False]

def encuentra_contenido(url, archivo_salida):
    # Convert one PDF to pdftoxml output, pull out the table text nodes and
    # split them into a left and a right column by their "left" coordinate.
    u = urllib.urlopen(url)
    x = scraperwiki.pdftoxml(u.read())  # interpret it as xml
    print x  # let's see what's in there, abbreviated...
    r = lxml.etree.fromstring(x)
    r.xpath('//page[@number="1"]')
    todo = r.xpath('//text[@height="10" and @font="4"]')
    print len(todo)
    if len(todo) < 40:
        todo = r.xpath('//text[@height="10" and @font="1"]')
    derecho = list()
    izquierdo = list()
    n = len(todo)
    elemento = 0
    while elemento < n:
        texto_procesado = busca_datos(todo[elemento].text.strip().encode('utf-8'), elemento, todo)
        if len(texto_procesado) > 1:
            curado = texto_procesado[2]
            elemento = texto_procesado[1]
        else:
            curado = todo[elemento].text.strip().encode('utf-8')
        print curado + " " + str(elemento)
        if int(todo[elemento].get('left')) > 200:
            derecho.append(curado)
        else:
            izquierdo.append(curado)
        elemento = elemento + 1
    guarda_archivo(archivo_salida, izquierdo, derecho)

for i in range(2, argumentos):
    url = "file://localhost/home/mekler/Downloads/WEF_Financial_Development_Report_2008_2012/" + sys.argv[1] + sys.argv[i]
    archivo_salida = sys.argv[1] + sys.argv[i] + ".csv"
    encuentra_contenido(url, archivo_salida)
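To see what wef-hack_2008.py expects from scraperwiki.pdftoxml without downloading a report, here is a small sketch that applies the same regex and the same left/right split to a hand-made XML fragment. The fragment, its coordinates and its text are assumptions for illustration, not taken from an actual WEF PDF.

# -*- coding: utf-8 -*-
import re
import lxml.etree

# Assumed, hand-made fragment imitating pdftoxml output (coordinates are invented).
fragmento = """
<page number="1">
  <text left="80"  height="10" font="4">1 United States</text>
  <text left="150" height="10" font="4">...........</text>
  <text left="210" height="10" font="4">5.85</text>
</page>
"""

r = lxml.etree.fromstring(fragmento)
todo = r.xpath('//text[@height="10" and @font="4"]')

# Same pattern as busca_datos: a rank (or "n/a") followed by a country name.
patron = re.compile(r'([\d]{1,2}|n\/a)\s+([a-zA-ZáÁÉéíÍóÓúÚñÑ\s]+|Korea, Rep. ?)$')

for nodo in todo:
    texto = nodo.text.strip()
    # Nodes with left > 200 go to the right-hand column, the rest to the left-hand one.
    lado = "derecho" if int(nodo.get('left')) > 200 else "izquierdo"
    print lado, repr(texto), ("rank+country" if patron.match(texto) else "")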
So you can try it out:
https://www.dropbox.com/s/38nuil6a2vnk6s3/5626281-Financial-Development-Report-2008.tar.gz