Skip to content

Instantly share code, notes, and snippets.

@dbeley
Created August 24, 2019 17:19
Show Gist options
  • Save dbeley/bea8b318bdd614bbbd26ed535ef8ce9a to your computer and use it in GitHub Desktop.
Save dbeley/bea8b318bdd614bbbd26ed535ef8ce9a to your computer and use it in GitHub Desktop.
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Scrape every page linked from the first navbox of the English Wikipedia
# "Napoleon" article. For each page, record its title, the infobox "Result"
# cell(s) and its URL, plus the same three fields from the French version of
# the page when an interwiki link exists. The rows are written to a
# tab-separated file.

WIKI_ROOT = "https://en.wikipedia.org"
START_URL = f"{WIKI_ROOT}/wiki/Napoleon"
OUTPUT_PATH = "results.csv"
# Seconds. requests has no default timeout, so without one a stalled
# connection would block the whole run forever.
REQUEST_TIMEOUT = 30


def resolve_href(base_url: str, href: str) -> str:
    """Return *href* as an absolute URL, resolved against *base_url*.

    Site-relative hrefs ("/wiki/X") are joined to the host of *base_url*;
    absolute ("https://...") and protocol-relative ("//host/...") hrefs are
    kept intact instead of being glued behind the English host name.
    """
    return urljoin(base_url, href)


def _fetch_soup(session: "requests.Session", url: str) -> "BeautifulSoup":
    """Download *url* and parse the response body with the lxml parser.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status (so error pages are never parsed as articles).
    """
    response = session.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.content, "lxml")


def _page_title(soup: "BeautifulSoup") -> str:
    """Return the text of the page's ``<h1 class="firstHeading">`` element.

    Raises AttributeError when the page has no such heading.
    """
    return soup.find("h1", {"class": "firstHeading"}).text


def _scrape_battle(session: "requests.Session", url: str) -> dict:
    """Scrape one English page and, when linked, its French counterpart.

    Returns a dict that may contain the keys "Name EN", "Result EN",
    "URL EN", "Name FR", "Result FR" and "URL FR". Scraping is best-effort:
    when an expected tag is missing, a message is printed and the fields
    collected so far are returned.

    Raises:
        requests.RequestException: if the English page cannot be fetched.
    """
    battle = {}
    soup_en = _fetch_soup(session, url)

    try:
        battle["Name EN"] = _page_title(soup_en)
        # Text nodes equal to "Result" label the infobox row; the outcome is
        # in the first <td> that follows each of them.
        battle["Result EN"] = [
            label.find_next("td").text.strip()
            for label in soup_en.find_all(string="Result")
        ]
        battle["URL EN"] = url
    except (AttributeError, KeyError, TypeError) as e:
        print(f"Balise non trouvée : {e}")
        # The French page is only attempted after the English fields succeed.
        return battle

    try:
        fr_anchor = soup_en.find("li", {"class": "interwiki-fr"}).find("a")
        url_fr = resolve_href(url, fr_anchor["href"])
        soup_fr = _fetch_soup(session, url_fr)
        battle["Name FR"] = _page_title(soup_fr)
        # On the French page the row is found by a <th> whose text
        # contains "Issue".
        battle["Result FR"] = [
            header.find_next("td").text.strip()
            for header in soup_fr.find_all("th")
            if "Issue" in header.text
        ]
        battle["URL FR"] = url_fr
    except (AttributeError, KeyError, TypeError, requests.RequestException) as e:
        print(f"Lien français non trouvé : {e}")

    return battle


def main() -> None:
    """Scrape all navbox links of the start page and write ``results.csv``.

    Raises:
        requests.RequestException: if the start page cannot be fetched.
        RuntimeError: if the start page contains no ``div.navbox``.
    """
    battles = []
    # One session for the whole run so connections to the same host are reused.
    with requests.Session() as session:
        soup_base = _fetch_soup(session, START_URL)
        navbox = soup_base.find("div", {"class": "navbox"})
        if navbox is None:
            raise RuntimeError(f"No navbox found on {START_URL}")

        for index, item in enumerate(navbox.find_all("li"), 1):
            try:
                url = resolve_href(START_URL, item.find("a")["href"])
                battle = _scrape_battle(session, url)
            except Exception as e:
                # Deliberately broad: this is the per-link boundary of a
                # best-effort crawl, so one bad link must not abort the run.
                print(f"Pas possible de scraper : {e}")
            else:
                battles.append(battle)
                print(f"{index}, {battle}")

    pd.DataFrame(battles).to_csv(OUTPUT_PATH, sep="\t")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment