Download ArXiv papers from "awesome" repos related to DL and CV
'''Modified from Source -----> "https://github.com/jyguo1729/web-scraping-for-PDF-file" '''
import os
import sys
import urllib.error
import urllib.request
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

def get_title(url):
    # Scrape the paper title from an arXiv abstract page, e.g. 'https://arxiv.org/abs/1108.3525'
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    title = soup.select_one('h1.title.mathjax').text.replace('Title:', '').strip()
    return title
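
# A quick sanity check (assumes arXiv still serves titles in the
# 'h1.title.mathjax' element):
#   get_title('https://arxiv.org/abs/1706.03762')  # -> 'Attention Is All You Need'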
# read the target URL from the command line, with a default fallback
try:
    url = sys.argv[1]
except IndexError:
    url = 'http://web.cs.ucla.edu/~yzsun/classes/2018Fall_CS145/schedule.html'

# query the website and parse the returned HTML with Beautiful Soup
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, 'html.parser')
# print(soup.prettify())  # uncomment to inspect the raw HTML
# href=True skips anchors with no href attribute, which would otherwise raise a KeyError
all_links = soup.find_all('a', href=True)
# collect each anchor's text and absolute URL
# (get_text() avoids the IndexError that link.contents[0] raises on empty anchors)
descriptions = []
links = []
for link in all_links:
    descriptions.append(link.get_text(strip=True))
    links.append(urljoin(url, link['href']))
df = pd.DataFrame({'Description': descriptions, 'link': links})
# write the link table to <script dir>/output/output.csv, creating the folder if needed
dirname = os.path.dirname(os.path.abspath(__file__))
relpath = 'output'
os.makedirs(os.path.join(dirname, relpath), exist_ok=True)
path = os.path.join(dirname, relpath, 'output.csv')
df.to_csv(path)
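# output.csv now holds one row per anchor on the page: 'Description'
# (the link text) and 'link' (the absolute URL)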
for link in links:
    print('*' * 80)
    print(link)
    # only arXiv links are downloaded; skip everything else
    if 'arxiv' not in link:
        continue
    # resolve the title from the /abs/ page, then point the link at the /pdf/ download
    # ('/abs/' with slashes, so an id that happens to contain 'abs' is left alone)
    if 'abs' in link:
        title = get_title(link)
        link = link.replace('/abs/', '/pdf/') + '.pdf'
    else:
        link = link.replace('/pdf', '/abs').replace('.pdf', '')
        title = get_title(link)
        link = link.replace('/abs/', '/pdf/') + '.pdf'
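    # e.g. 'https://arxiv.org/abs/1706.03762' becomes 'https://arxiv.org/pdf/1706.03762.pdf'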
    print('download link --------------->', link)
    # arXiv titles can contain '/', which is illegal in file names
    file_name = title.replace('/', '-') + '.pdf'
    print("file name ----> ", file_name)
    # test whether the link can be opened
    try:
        u = urllib.request.urlopen(link, timeout=500)
    except urllib.error.URLError as e:
        print(e.reason)
        continue
    # skip anything the server does not label as a PDF
    meta = u.info()
    if meta['Content-Type'] != 'application/pdf':
        print(file_name, "is not a PDF file")
        continue
    # build an absolute path for the file
    path_file_name = os.path.join(dirname, relpath, file_name)
    print("path_file_name is", path_file_name)
    # download the file, reusing the response opened above
    with open(path_file_name, 'wb') as f:
        try:
            f.write(u.read())
        except Exception as e:
            print("error in download ...............", e)