Last active
July 28, 2017 07:10
-
-
Save suriyadeepan/60dcb03e437293b8d8cf6755fe10a3c3 to your computer and use it in GitHub Desktop.
Scrape from tamilvu.org
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Extract images from a pdf.

Requires imagemagick and wand:
    sudo apt install imagemagick
    sudo pip3 install --upgrade Wand
'''
import os
import sys

from wand.image import Image

from utils import *
def pdf2im(filepath, resolution=300):
    """Render each page of a PDF as a numbered PNG image.

    Pages are written to a sibling folder named after the PDF, e.g.
    ``./dir/book.pdf`` -> ``./dir/book/0.png``, ``./dir/book/1.png``, ...

    filepath   : path to the source pdf
    resolution : rasterization DPI handed to ImageMagick (default 300)
    """
    # extract path and "filename" (extension stripped); os.path handles
    # dotted filenames and platform separators, unlike split('/')/split('.')
    directory, name = os.path.split(filepath)
    filename = os.path.splitext(name)[0]
    path = os.path.join(directory, filename) + '/'
    # create such path
    create_folder(path)
    with Image(filename=filepath, resolution=resolution) as src:
        # wrap each page in its own Image and close it promptly so we
        # don't leak/accumulate every rasterized page (the original
        # never closed the per-page Image objects)
        for i, page in enumerate(src.sequence):
            with Image(page) as img:
                img.save(filename=os.path.join(path, str(i) + '.png'))
if __name__ == '__main__':
    # demo run: split one sample pdf into per-page png images
    sample = './tamilvu/music/cpajaneikkiirttneikalhiva.pdf'
    pdf2im(sample)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
beautifulsoup4==4.6.0 | |
bs4==0.0.1 | |
certifi==2017.4.17 | |
chardet==3.0.4 | |
idna==2.5 | |
lxml==3.8.0 | |
requests==2.18.2 | |
urllib3==1.22 | |
Wand==0.4.4 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import requests | |
from utils import * | |
BASE = 'http://tamilvu.org/library/nationalized/scholars'

# category index pages on tamilvu.org, each listing links to pdf files
_CATEGORIES = [
    'music', 'literature', 'education', 'poetry', 'law',
    'sociology', 'biography', 'drama', 'general', 'materialscience',
    'religion', 'language', 'history', 'agriculture', 'others',
]
seed_urls = [BASE + '/html/' + category + '.htm' for category in _CATEGORIES]
def get_soup(url):
    """Fetch *url* and parse the response body with the lxml parser."""
    response = requests.get(url)
    return BeautifulSoup(response.content, 'lxml')
def decorate_link(url):
    """Turn a relative href into an absolute url under BASE.

    NOTE(review): the first two characters are dropped unconditionally —
    this assumes every href starts with './'; verify against get_links.
    """
    relative = url[2:]
    return BASE + relative
def get_links(url):
    """Collect absolute links to pdf files from one index page."""
    soup = get_soup(url)
    # keep anchors that carry an href mentioning 'pdf', in page order
    return [decorate_link(anchor.get('href'))
            for anchor in soup.find_all('a')
            if 'href' in anchor.attrs and 'pdf' in str(anchor.get('href'))]
def download_file(url, PATH='./'):
    """Download *url* into folder PATH, keeping the remote filename.

    url  : direct link to the file
    PATH : destination folder (must already exist; expected to end in '/')

    Raises requests.HTTPError on a non-2xx response, instead of silently
    saving an error page under a .pdf name as the original did.
    """
    filename = url.split('/')[-1]
    # get response handle; stream=True so large pdfs are written in
    # chunks rather than buffered whole in memory
    response = requests.get(url, stream=True)
    response.raise_for_status()
    # write to file
    with open(PATH + filename, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1 << 16):
            f.write(chunk)
if __name__ == '__main__':
    # root folder that holds every downloaded file
    create_folder('./tamilvu')

    print(':: Gathering links ::')
    links = []
    link2folder = {}
    n_urls = len(seed_urls)
    for i, url in enumerate(seed_urls):
        print(' [{}/{}] {}'.format(i, n_urls, url))
        # one sub-folder per category, named after the index page
        subfolder = './tamilvu' + '/' + url.split('/')[-1].split('.')[0] + '/'
        create_folder(subfolder)
        # pdf links found on this category page
        clinks = get_links(url)
        for link in clinks:
            link2folder[link] = subfolder
        links.extend(clinks)

    # keep a copy of the link list on disk, so an external downloader
    # (wget / aria2) can take over if this script dies midway
    save2file(links)

    print(':: Downloading files ::')
    n_links = len(links)
    for i, link in enumerate(links):
        print(' [{}/{}] {}'.format(i, n_links, link))
        download_file(link, link2folder[link])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
def save2file(items, filename='items.list'):
    """Write one item per line (str()-converted) to *filename*."""
    lines = [str(item) + '\n' for item in items]
    with open(filename, 'w') as f:
        f.writelines(lines)
def create_folder(name):
    """Create directory *name* (including parents) if it doesn't exist.

    Uses exist_ok=True so there is no window between the existence check
    and the mkdir — the original check-then-create could raise if another
    process created the folder in between (TOCTOU).
    """
    os.makedirs(name, exist_ok=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
py3 requirements
beautifulsoup4==4.6.0
bs4==0.0.1
certifi==2017.4.17
chardet==3.0.4
idna==2.5
lxml==3.8.0
requests==2.18.2
urllib3==1.22
Wand==0.4.4