Python script for the Italian Wikisource: it scrapes ns0 books, getting all the metadata, and also the cover URL from the Index page.
#!/usr/bin/env python
import requests
from bs4 import BeautifulSoup
import unicodecsv
import HTMLParser  # for HTMLParser.HTMLParseError, used below (Python 2 stdlib)
# uses unicodecsv for generating a CSV; this script targets Python 2.7.
# If you use Python 3.x, go with the stdlib "csv" module and change the
# related instructions accordingly. Beware: pass the mode to open(), i.e.
#   csv.writer(open("FILE.csv", "wt"))
# not csv.writer(open("FILE.csv"), "wt"), otherwise it gives you an error.
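# A minimal Python 3 sketch of the same writer setup (assumption: same file
# name and column layout as below):
#   import csv
#   out = csv.writer(open("listametadati.csv", "w", newline="", encoding="utf-8"))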
# create output CSV file
out = unicodecsv.writer(open("listametadati.csv", "wb"), encoding="utf-8")

# write the header row
head = ("TITLE", "TYPE", "URL", "CCE", "SUBJECT", "CREATOR", "CONTRIBUTOR", "DESCRIPTION", "PUBLISHER", "DATE", "FORMAT", "SOURCE", "LANGUAGE", "RELATION", "COVERAGE", "RIGHTS", "IMMAGINE", "IMMAGINE ED.")
out.writerow(head)
# retrieve the cover URL from the Index page
def get_cover(page, data_indice):
    #page = wikipedia.page(book)
    page_cover = requests.get("http://it.wikisource.org/wiki/" + data_indice)
    soup_cover = BeautifulSoup(page_cover.text, "html.parser")
    cover_line = soup_cover.find("img", {"class": "thumbimage"})
    if cover_line is not None:
        cover_semiurl = cover_line['src']
        cover_url = "http:" + str(cover_semiurl)
    else:
        cover_url = ""
    return cover_url
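# Usage sketch (hypothetical Index page name):
#   get_cover(page, "Indice:Esempio.djvu")
# returns the src of the first thumbnail image found on the Index page
# (typically an upload.wikimedia.org URL), or "" when there is no thumbnail.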
# get all the metadata for one book:
# it scrapes the microformat markup in the ns0 page with BeautifulSoup
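# The ns0 page embeds a microformat span shaped roughly like this
# (illustrative; the attribute names are the ones read below):
#   <span id="dati" data-argomento="..."
#         data-urldellaversionecartaceaafronte="Indice:..."></span>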
def get_bookmetadata(book):
    # get the page HTML
    titolo = "http://it.wikisource.org/wiki/" + book[0]
    print titolo
    page = requests.get(titolo)
    # make it a 'soup'
    soup = BeautifulSoup(page.text, "html.parser")
    data = soup.find("span", {"id": "dati"})
    # find topic (.get returns None instead of raising when the attribute is missing)
    data_argomento = data.get('data-argomento')
    # find Index URL
    data_indice = data.get('data-urldellaversionecartaceaafronte')
    print data_indice
    # find all the metadata;
    # default to empty strings so the CSV row can always be written
    title = author = publisher = date = place = ""
    try:
        # if a title is too long, this takes the one from the ns0 page
        # and not the one with the line break
        titles = soup.find('span', attrs={"id": "ws-title"})
    except HTMLParser.HTMLParseError:
        titles = None
    try:
        authors = soup.find('span', attrs={"id": "ws-author"})
    except HTMLParser.HTMLParseError:
        authors = None
    try:
        publishers = soup.find('span', attrs={"id": "ws-publisher"})
    except HTMLParser.HTMLParseError:
        publishers = None
    try:
        dates = soup.find('span', attrs={"id": "ws-year"})
    except HTMLParser.HTMLParseError:
        dates = None
    try:
        places = soup.find('span', attrs={"id": "ws-place"})
    except HTMLParser.HTMLParseError:
        places = None
    # this was probably an attempt to generate the Index cover URL...
    """
    covers = soup.find('span', attrs={"data-urldellaversionecartaceaafronte"})
    try:
        cover_url = "http:" + get_cover()
    except HTMLParser.HTMLParseError:
        cover_url = ""
    """
    if titles is not None:
        title = titles.text  #.encode('ascii', 'ignore')
    if authors is not None:
        author = authors.text  #.encode('ascii', 'ignore')
    if publishers is not None:
        publisher = publishers.text  #.encode('ascii', 'ignore')
    if dates is not None:
        date = dates.text  #.encode('ascii', 'ignore')
    if places is not None:
        place = places.text  #.encode('ascii', 'ignore')
    if data_indice is not None:
        #print "We have the text version somewhere!"
        cover_url = get_cover(page, data_indice)
    else:
        cover_url = ""
    out.writerow([
        unicode(title),                                   # TITLE
        "E-book Open",                                    # TYPE
        u"http://wsexport.wmflabs.org/tool/book.php?lang=it&format=epub&page=" + unicode(book[0]),  # URL
        None,                                             # CCE
        unicode(data_argomento),                          # SUBJECT
        unicode(author),                                  # CREATOR
        None,                                             # CONTRIBUTOR
        None,                                             # DESCRIPTION
        u"Wikisource, la biblioteca libera. <it.wikisource.org>",  # PUBLISHER
        None,                                             # DATE
        u"HTML | EPUB",                                   # FORMAT
        unicode(date) + " | " + unicode(publisher) + " | " + unicode(place),  # SOURCE
        u"Italiano",                                      # LANGUAGE
        u"http://it.wikisource.org/wiki/" + unicode(book[0]),  # RELATION
        None,                                             # COVERAGE
        u"Pubblico dominio",                              # RIGHTS
        unicode(cover_url),                               # IMMAGINE
        u"https://upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Wikisource-logo.svg/229px-Wikisource-logo.svg.png",  # IMMAGINE ED.
    ])
# read the books from an input CSV;
# any file layout is fine, as long as the first column holds the ns0 title
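# Input sketch (assumption about listalibri.csv): one ns0 page title per row
# in the first column, e.g. hypothetical rows such as:
#   I_promessi_sposi
#   Il_fu_Mattia_Pascal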
books = unicodecsv.reader(open("listalibri.csv"), encoding="utf-8")
for book in books:
    get_bookmetadata(book)

print "everything is ok"
""" | |
it is probably possible to write rewrite everything using Pywikibot and parsing the templates... | |
""" |