Search and download top chart songs
from bs4 import BeautifulSoup
from urlparse import urljoin
import urllib2, pickle

# Module-level state shared between the crawl functions below.
soup = None
pickle_data = None
not_load_save = False


def read_url_soup(url):
    """Fetch url and return its parsed BeautifulSoup tree."""
    return BeautifulSoup(urllib2.urlopen(url), 'html.parser')
def save_pickle(data, filename, open_method='w'):
    """Pickle data to filename, exiting on failure."""
    try:
        with open(filename, open_method) as f:
            pickle.dump(data, f)
    except Exception as e:
        print e
        exit()
    else:
        print 'Saved in', filename
def load_pickle(filename):
    """Unpickle and return the contents of filename, or None if it cannot be read."""
    print 'Trying to load', filename
    try:
        with open(filename) as f:
            data = pickle.load(f)
    except Exception as e:
        print e
        print 'Not loaded'
        return None
    print 'Loaded', filename
    return data
def store_new_songs(data, pickle_filename):
    """Append the songs in data that are not already in pickle_data."""
    global pickle_data
    if not not_load_save:
        pickle_data = load_pickle(pickle_filename)
    if not pickle_data:
        pickle_data = []
    # Only keep songs we have not seen before.
    pickle_data.extend([song for song in data if song not in pickle_data])
    if not not_load_save:
        save_pickle(pickle_data, pickle_filename, 'w')
def crawl_songs(url, song_tag_class, title_tag_class, artist_tag_class):
    """Scrape one chart page and return a list of [title, artist] pairs."""
    if not all(isinstance(x, list) for x in (song_tag_class, title_tag_class, artist_tag_class)):
        print song_tag_class, title_tag_class, artist_tag_class
        raise Exception('Need song, title and artist to be in [tag, class] form')
    global soup
    soup = read_url_soup(url)
    songs = soup.find_all(song_tag_class[0], song_tag_class[1])

    def soup_find(soup, tag, obj_class):
        match = soup.find(tag, obj_class)
        if match:
            return match.get_text().strip().encode('utf-8')
        # No match: dump the fragment so the selectors can be debugged.
        print soup
        print '<{} {}> None? {}'.format(tag, obj_class, match)

    return [[soup_find(x, title_tag_class[0], title_tag_class[1]),
             soup_find(x, artist_tag_class[0], artist_tag_class[1])] for x in songs]
def get_next_url(url, next_url_tag_class):
    """Return the absolute URL of the next chart page to crawl, or None."""
    global soup
    if not soup:
        soup = read_url_soup(url)
    next_url = soup.find(next_url_tag_class[0], next_url_tag_class[1])
    if not next_url:
        print 'No next url'
        return None
    # urljoin resolves the (usually relative) href against the current page URL.
    return urljoin(url, next_url.get('href'))
def crawl_website(url, song_tag_class, title_tag_class, artist_tag_class,
                  next_url_tag_class, pickle_filename, recursion_times=5):
    """Crawl up to recursion_times chart pages starting at url and pickle the songs found."""
    if not isinstance(next_url_tag_class, list):
        raise Exception('Need next_url_tag_class to be in [tag, class] form')
    if recursion_times < 0:
        raise Exception('recursion_times must be non-negative, got {}'.format(recursion_times))
    global not_load_save
    # Load once here and save once at the end instead of on every store_new_songs call.
    not_load_save = True
    global pickle_data
    pickle_data = load_pickle(pickle_filename)
    for i in range(recursion_times):
        print 'Iter', i + 1
        store_new_songs(crawl_songs(url, song_tag_class, title_tag_class, artist_tag_class), pickle_filename)
        if i + 1 != recursion_times:
            print 'Getting next url...'
            url = get_next_url(url, next_url_tag_class)
            print url
        print 'Finished storing new songs, current length {}\n'.format(len(pickle_data))
        if not url:
            print 'Ran out of urls!'
            break
    print 'Saving {} songs'.format(len(pickle_data))
    save_pickle(pickle_data, pickle_filename, 'w')
    print 'Done'
if __name__ == '__main__':
    ## Alternative configuration for officialcharts.com:
    ## options = {
    ##     'url': 'http://www.officialcharts.com/charts/singles-chart/',
    ##     'song_tag_class': ['div', 'track'],
    ##     'title_tag_class': ['div', 'title'],
    ##     'artist_tag_class': ['div', 'artist'],
    ##     'next_url_tag_class': ['a', 'chart-date-directions'],
    ##     'pickle_filename': 'songs.pickletest',
    ##     'iterations': 1
    ## }
    options = {
        'url': 'http://www.billboard.com/charts/dance-electronic-songs',
        'song_tag_class': ['div', {'class': 'chart-row__container'}],
        'title_tag_class': ['h2', {'class': 'chart-row__song'}],
        'artist_tag_class': [['a', 'h3'], {'class': 'chart-row__artist'}],
        'next_url_tag_class': ['a', {'class': 'chart-nav__link', 'data-tracklabel': "Week-previous"}],
        'pickle_filename': 'songs2.pickle',
        'iterations': 500
    }
    ## Inspect an existing pickle instead of crawling:
    ## data = load_pickle(options['pickle_filename'])
    ## for i in data:
    ##     print i[0], '-', i[1]
    ## print len(data)
    crawl_website(options['url'], options['song_tag_class'], options['title_tag_class'],
                  options['artist_tag_class'], options['next_url_tag_class'],
                  options['pickle_filename'], options['iterations'])
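The crawl only works while the song, title and artist [tag, class] selectors still match the live markup, and chart sites change their HTML regularly. Below is a minimal sketch for sanity-checking the Billboard selectors used in the options dict against a single page before starting a 500-iteration crawl; the function name check_selectors is illustrative and not part of the gist.

# Sketch: verify the tag/class selectors against one page before a long crawl.
# Assumes the Billboard markup targeted by the options dict above; adjust as needed.
from bs4 import BeautifulSoup
import urllib2

def check_selectors(url, song_tag_class, title_tag_class, artist_tag_class):
    soup = BeautifulSoup(urllib2.urlopen(url), 'html.parser')
    rows = soup.find_all(song_tag_class[0], song_tag_class[1])
    print 'Found {} song rows'.format(len(rows))
    for row in rows[:3]:
        title = row.find(title_tag_class[0], title_tag_class[1])
        artist = row.find(artist_tag_class[0], artist_tag_class[1])
        print '{} - {}'.format(
            title.get_text().strip().encode('utf-8') if title else '<no title>',
            artist.get_text().strip().encode('utf-8') if artist else '<no artist>')

# Example call with the same selectors as the options dict above:
# check_selectors('http://www.billboard.com/charts/dance-electronic-songs',
#                 ['div', {'class': 'chart-row__container'}],
#                 ['h2', {'class': 'chart-row__song'}],
#                 [['a', 'h3'], {'class': 'chart-row__artist'}])

If no rows are found or the titles come back as <no title>, the selectors need updating before the full crawl is worth running.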
# Companion script to crawler.py: print the songs stored in a pickle given on the command line.
import sys
from crawler import load_pickle

data = load_pickle(sys.argv[1])
for i in data:
    print i[0], '-', i[1]
print len(data)
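The pickle holds a plain list of [title, artist] byte strings, so it is straightforward to turn into search queries for whatever download step follows. A minimal sketch, reusing load_pickle from crawler.py; the export_queries name and the queries.txt filename are illustrative, not part of the gist.

# Sketch: dump the stored songs to a text file, one "Title - Artist" query per line.
import sys
from crawler import load_pickle

def export_queries(pickle_filename, out_filename='queries.txt'):
    songs = load_pickle(pickle_filename)
    with open(out_filename, 'w') as f:
        for title, artist in songs:
            f.write('{} - {}\n'.format(title, artist))
    print 'Wrote {} queries to {}'.format(len(songs), out_filename)

if __name__ == '__main__':
    export_queries(sys.argv[1])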