Download ArXiv papers from "awesome" repos related to DL and CV
'''Modified from Source -----> "https://github.com/jyguo1729/web-scraping-for-PDF-file" '''
import os
import sys
import urllib.error
import urllib.request
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

def get_title(url):
    # Scrape the paper title from an arXiv abstract page, e.g. 'https://arxiv.org/abs/1108.3525'
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    title = soup.select_one('h1.title.mathjax').text.replace('Title:', '').strip()
    return title
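
# A quick sanity check (assumes arXiv still serves titles in the
# 'h1.title.mathjax' element):
#   get_title('https://arxiv.org/abs/1706.03762')  # -> 'Attention Is All You Need'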
# read the target URL from the command line, with a default fallback
try:
    url = sys.argv[1]
except IndexError:
    url = 'http://web.cs.ucla.edu/~yzsun/classes/2018Fall_CS145/schedule.html'

# query the website and parse the returned HTML with Beautiful Soup
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, 'html.parser')
# print(soup.prettify())  # uncomment to inspect the raw HTML
# href=True skips anchors with no href attribute, which would otherwise raise a KeyError
all_links = soup.find_all('a', href=True)
# collect each anchor's text and absolute URL
# (get_text() avoids the IndexError that link.contents[0] raises on empty anchors)
descriptions = []
links = []
for link in all_links:
    descriptions.append(link.get_text(strip=True))
    links.append(urljoin(url, link['href']))
df = pd.DataFrame({'Description': descriptions, 'link': links})
# write the link table to <script dir>/output/output.csv, creating the folder if needed
dirname = os.path.dirname(os.path.abspath(__file__))
relpath = 'output'
os.makedirs(os.path.join(dirname, relpath), exist_ok=True)
path = os.path.join(dirname, relpath, 'output.csv')
df.to_csv(path)
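# output.csv now holds one row per anchor on the page: 'Description'
# (the link text) and 'link' (the absolute URL)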
for link in links:
    print('*' * 80)
    print(link)
    # only arXiv links are downloaded; skip everything else
    if 'arxiv' not in link:
        continue
    # resolve the title from the /abs/ page, then point the link at the /pdf/ download
    # ('/abs/' with slashes, so an id that happens to contain 'abs' is left alone)
    if 'abs' in link:
        title = get_title(link)
        link = link.replace('/abs/', '/pdf/') + '.pdf'
    else:
        link = link.replace('/pdf', '/abs').replace('.pdf', '')
        title = get_title(link)
        link = link.replace('/abs/', '/pdf/') + '.pdf'
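    # e.g. 'https://arxiv.org/abs/1706.03762' becomes 'https://arxiv.org/pdf/1706.03762.pdf'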
    print('download link --------------->', link)
    # arXiv titles can contain '/', which is illegal in file names
    file_name = title.replace('/', '-') + '.pdf'
    print("file name ----> ", file_name)
    # test whether the link can be opened
    try:
        u = urllib.request.urlopen(link, timeout=500)
    except urllib.error.URLError as e:
        print(e.reason)
        continue
    # skip anything the server does not label as a PDF
    meta = u.info()
    if meta['Content-Type'] != 'application/pdf':
        print(file_name, "is not a PDF file")
        continue
    # build an absolute path for the file
    path_file_name = os.path.join(dirname, relpath, file_name)
    print("path_file_name is", path_file_name)
    # download the file, reusing the response opened above
    with open(path_file_name, 'wb') as f:
        try:
            f.write(u.read())
        except Exception as e:
            print("error in download ...............", e)