-
-
Save EdmundMartin/0230e90cbd4e0790afdd22318fa0ad8b to your computer and use it in GitHub Desktop.
| import requests | |
| from bs4 import BeautifulSoup | |
| import time | |
# Browser-like request headers used for the search request; the value
# identifies the client as desktop Chrome 61 on 64-bit Windows 10.
USER_AGENT = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/61.0.3163.100 Safari/537.36'
    ),
}
def fetch_results(search_term, number_results, language_code, timeout=10):
    """Fetch the raw Google results page for a search term.

    Args:
        search_term: Query text; must be a ``str``.
        number_results: Value sent as the ``num`` query parameter; must be
            an ``int``.
        language_code: Value sent as the ``hl`` query parameter
            (for example ``"en"``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(search_term, html)`` tuple, where ``html`` is the response
        body as text.

    Raises:
        AssertionError: If ``search_term`` is not a string or
            ``number_results`` is not an integer.
        requests.HTTPError: If the response has an error status code.
        requests.RequestException: On any other request failure,
            including a timeout.
    """
    # Raised explicitly instead of via ``assert`` so the validation is not
    # stripped under ``python -O``. The exception type stays AssertionError
    # because scrape_google catches exactly that type.
    if not isinstance(search_term, str):
        raise AssertionError('Search term must be a string')
    if not isinstance(number_results, int):
        raise AssertionError('Number of results must be an integer')

    # Let requests build the query string so that characters such as
    # '&', '#', '+' or non-ASCII text in the search term are URL-encoded
    # instead of corrupting the URL.
    query = {'q': search_term, 'num': number_results, 'hl': language_code}
    response = requests.get(
        'https://www.google.com/search',
        params=query,
        headers=USER_AGENT,
        timeout=timeout,  # without a timeout the call can block forever
    )
    response.raise_for_status()
    return search_term, response.text
def parse_results(html, keyword):
    """Pull ranked result entries out of a Google results page.

    Args:
        html: Markup of the results page.
        keyword: Search term stored on every returned entry.

    Returns:
        A list of dicts with the keys ``keyword``, ``rank``, ``title``,
        ``description`` and ``link``. ``rank`` starts at 1 and only counts
        entries that were kept. ``description`` is ``None`` when the
        result block has no ``span.st`` element.
    """
    document = BeautifulSoup(html, 'html.parser')
    hits = []
    for block in document.find_all('div', attrs={'class': 'g'}):
        anchor = block.find('a', href=True)
        heading = block.find('h3', attrs={'class': 'r'})
        snippet = block.find('span', attrs={'class': 'st'})

        # A block needs both a link and a title to be recorded.
        if not (anchor and heading):
            continue

        href = anchor['href']
        # Links that are just '#' are skipped and do not consume a rank.
        if href == '#':
            continue

        hits.append({
            'keyword': keyword,
            'rank': len(hits) + 1,
            'title': heading.get_text(),
            'description': snippet.get_text() if snippet else snippet,
            'link': href,
        })
    return hits
def scrape_google(search_term, number_results, language_code):
    """Fetch a Google results page and return its parsed entries.

    Args:
        search_term: Query text, passed through to ``fetch_results``.
        number_results: Number of results to request.
        language_code: Interface language code for the request.

    Returns:
        The list produced by ``parse_results`` for the fetched page.

    Raises:
        Exception: With a user-oriented message when argument validation
            fails, when the server answers with an error status, or when
            the request itself fails. The original error is attached as
            ``__cause__`` so the root failure stays visible in tracebacks.
    """
    try:
        keyword, html = fetch_results(search_term, number_results, language_code)
        return parse_results(html, keyword)
    except AssertionError as err:
        raise Exception("Incorrect arguments parsed to function") from err
    # HTTPError must be handled before its parent class RequestException.
    except requests.HTTPError as err:
        raise Exception("You appear to have been blocked by Google") from err
    except requests.RequestException as err:
        raise Exception("Appears to be an issue with your connection") from err
if __name__ == '__main__':
    # Demo run: scrape a fixed list of search terms and print every
    # collected result entry at the end.
    search_terms = ['edmund martin', 'python', 'google scraping']
    collected = []
    for term in search_terms:
        try:
            collected.extend(scrape_google(term, 100, "en"))
        except Exception as error:
            # A failed term is reported and skipped; the run continues.
            print(error)
        finally:
            # Pause after every term, successful or not -- presumably to
            # space out requests and lower the risk of being blocked.
            time.sleep(10)
    print(collected)
Hello, how can I add these:
metatags = soup.find_all('meta',attrs={'name':'generator'})
When I try to use it for the keyword "Commbank", I get this error:
'NoneType' object has no attribute 'get_text'
On line 36 don't you want to include the link in the dictionary you append to found_results?
@rajrsingh Thanks for pointing this out. I have made the changes.
@cabbage-dealer The code on the blog was updated to fix this error. This happens when a result doesn't have a standard description. I have also updated the code to avoid this issue.
I keep getting all elements printed in reverse order (link, description, title, rank, keyword) instead of (keyword, rank, title, description, link), and sometimes they are not in any order. I Googled this, and it's because the elements are contained in a set rather than a list.
Is there a solution to print out elements in specific order?
How can I get results for different countries?
Hi, when I run this code in Spyder I get the result [ ].
What does this mean?
Can you please help me out? I am new to this domain.
@EdmundMartin
the result
result_block = soup.find_all('div', attrs={'class': 'g'})
returns an empty [], so what was my mistake? Can you help?
Google has updated its selectors; you just need to change this selector a bit.
@EdmundMartin thanks for this. I am still getting [] as the result. I see you mentioned that Google has updated its selectors. What have they been updated to?
I am very new to this, programming and all, but I like the idea of this project and I am trying to expand on it. Anyway, when I ran this the first time, it worked. However, running it multiple times gave me the response that I've been blocked by Google. Is that because I need to use Google's API to scrape a search? Thanks for your help!
Have you seen an increase in blocking based on the language requested?