Download Files from a Website and save them into Folders
"""
The problem:
Download one by one and separate them into folders by country maybe not sound tedious but can be time consuming
if you are busy enough.
The purpose of this script is to:
implement web scraping with "beautifulsoup" library (to get the links of the files and the country they belongs to)
and "wget" to download the file.
References:
https://www.geeksforgeeks.org/implementing-web-scraping-python-beautiful-soup/
https://www.geeksforgeeks.org/downloading-files-web-using-python/
"""
import os

import requests
import wget
from bs4 import BeautifulSoup
URL = "https://www.taiwanexcellence.org/tw/study/ipsos"
def remove_blank_space(string):
    return string.replace(" ", "")


def remove_br(string):
    return string.replace("\n", "")
if __name__ == '__main__':
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    # Uncomment the lines below to inspect the page content; that is how you
    # find out where to extract the useful information/data from.
    # print(r.content)
    # print(soup.prettify())
    files = []
    table = soup.find('section', attrs={'class': 'nelson'})
    for row in table.findAll('div', attrs={'class': 'nelson__item'}):
        # The <h2> holds the country name; clean it up to use as a folder name.
        folder_name = remove_br(remove_blank_space(row.h2.text))
        if not os.path.exists(folder_name):
            os.makedirs(folder_name)
        for link in row.findAll('div', attrs={'class': 'text-xs-center'}):
            links = link.findAll('a', attrs={'class': 'btn nelson__btn'})
            for url in links:
                linkToFile = url.get('href')
                files.append(linkToFile)
                wget.download(linkToFile, out=folder_name)
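

# If you prefer to avoid the extra "wget" dependency, the same download can be
# done with requests alone. A minimal sketch: download_with_requests is a
# hypothetical helper, not part of the original script, and it assumes each
# link ends in a usable file name.
def download_with_requests(url, folder):
    filename = os.path.join(folder, url.split('/')[-1])
    if os.path.exists(filename):
        # Skip files that were already downloaded on a previous run.
        return
    response = requests.get(url, stream=True)
    response.raise_for_status()
    with open(filename, 'wb') as f:
        # Stream the body in chunks so large files are not held in memory.
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)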