@scratchmex
Last active June 24, 2019 08:48
Search and download top chart songs
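
The first file, crawler.py (Python 2), scrapes a chart page for song titles and artists, follows the site's "previous week" link, and accumulates the unique [title, artist] pairs in a pickle file.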
from bs4 import BeautifulSoup
import urllib2, pickle, re


def read_url_soup(url):
    """Fetch a url and parse it into a BeautifulSoup tree."""
    return BeautifulSoup(urllib2.urlopen(url), 'html.parser')
def save_pickle(data, filename, open_method='wb'):
    """Serialize data to filename; open_method must be a binary write mode."""
    try:
        with open(filename, open_method) as f:
            pickle.dump(data, f)
    except Exception as e:
        print e
        exit()
    else:
        print 'Saved in', filename
def load_pickle(filename):
    """Deserialize filename, returning None if it is missing or unreadable."""
    print 'Trying to load', filename
    try:
        with open(filename, 'rb') as f:
            data = pickle.load(f)
    except Exception as e:
        print e
        print 'Not loaded'
        return None
    else:
        print 'Loaded', filename
        return data
def store_new_songs(data, pickle_filename):
    global pickle_data
    if not not_load_save:
        pickle_data = load_pickle(pickle_filename)
    if not pickle_data:
        pickle_data = []
    try:
        # Append only songs that are not already stored.
        pickle_data.extend([x for x in data if x not in pickle_data])
    except Exception as e:
        print e
        exit()
    else:
        if not not_load_save:
            save_pickle(pickle_data, pickle_filename, 'wb')
def crawl_songs(url, song_tag_class, title_tag_class, artist_tag_class):
    # Every selector must be a [tag, class] pair.
    if not all(isinstance(x, list) for x in
               (song_tag_class, title_tag_class, artist_tag_class)):
        print song_tag_class, title_tag_class, artist_tag_class
        raise Exception('Need song, title and artist to be in [tag, class] form')
    global soup
    soup = read_url_soup(url)
    songs = soup.find_all(song_tag_class[0], song_tag_class[1])

    def soup_find(soup, tag, obj_class):
        match = soup.find(tag, obj_class)
        if match:
            return match.get_text().strip().encode('utf-8')
        print soup
        print '<{} {}> None? {}'.format(tag, obj_class, match)

    return [[soup_find(x, title_tag_class[0], title_tag_class[1]),
             soup_find(x, artist_tag_class[0], artist_tag_class[1])]
            for x in songs]
def get_next_url(url, next_url_tag_class):
    global soup
    if not soup:
        soup = read_url_soup(url)
    next_url = soup.find(next_url_tag_class[0], next_url_tag_class[1])
    if next_url:
        next_url = next_url.get('href')
    else:
        print 'No next url'
        return None
    # Keep only the scheme and domain of the current url, then append the
    # relative link minus its leading slash.
    url = re.match(r'(https?://)?([\da-z.-]+)\.([a-z.]{2,6})([/\w .-]*)*?/', url).group(0)
    return url + next_url[1:]
def crawl_website(url, song_tag_class, title_tag_class, artist_tag_class,
                  next_url_tag_class, pickle_filename, recursion_times=5):
    if type(next_url_tag_class) is not list:
        raise Exception('Need url to be in [tag, class] form')
    if recursion_times < 0:
        raise Exception('recursion_times must be positive and it is {}'.format(recursion_times))
    # Load the pickle once here and save once at the end, not on every page.
    global not_load_save
    not_load_save = True
    global pickle_data
    pickle_data = load_pickle(pickle_filename)
    for i in range(recursion_times):
        print 'Iter', i + 1
        store_new_songs(crawl_songs(url, song_tag_class, title_tag_class,
                                    artist_tag_class), pickle_filename)
        if i + 1 != recursion_times:
            print 'Getting next url...'
            url = get_next_url(url, next_url_tag_class)
            print url
        print 'Finished storing new songs, current length {}\n'.format(len(pickle_data))
        if not url:
            print 'Ran out of urls!'
            break
    print 'Saving {} songs'.format(len(pickle_data))
    save_pickle(pickle_data, pickle_filename, 'wb')
    print '|Done'
if __name__ == '__main__':
    ## Alternative configuration for officialcharts.com:
    ## options = {
    ##     'url': 'http://www.officialcharts.com/charts/singles-chart/',
    ##     'song_tag_class': ['div', 'track'],
    ##     'title_tag_class': ['div', 'title'],
    ##     'artist_tag_class': ['div', 'artist'],
    ##     'next_url_tag_class': ['a', 'chart-date-directions'],
    ##     'pickle_filename': 'songs.pickletest',
    ##     'iterations': 1
    ## }
    options = {
        'url': 'http://www.billboard.com/charts/dance-electronic-songs',
        'song_tag_class': ['div', {'class': 'chart-row__container'}],
        'title_tag_class': ['h2', {'class': 'chart-row__song'}],
        'artist_tag_class': [['a', 'h3'], {'class': 'chart-row__artist'}],
        'next_url_tag_class': ['a', {'class': 'chart-nav__link', 'data-tracklabel': 'Week-previous'}],
        'pickle_filename': 'songs2.pickle',
        'iterations': 500
    }
    ## data = load_pickle(options['pickle_filename'])
    ## for i in data:
    ##     print i[0], '-', i[1]
    ## print len(data)
    crawl_website(options['url'], options['song_tag_class'], options['title_tag_class'],
                  options['artist_tag_class'], options['next_url_tag_class'],
                  options['pickle_filename'], options['iterations'])
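
The crawler targets Python 2 (urllib2 and print statements). On Python 3, where urllib2 was split into urllib.request, an equivalent fetch helper might look like the following sketch (not part of the original gist):

from urllib.request import urlopen

from bs4 import BeautifulSoup


def read_url_soup(url):
    # Same behaviour as the Python 2 helper above: fetch the page and parse it.
    return BeautifulSoup(urlopen(url), 'html.parser')

The second file in the gist imports load_pickle from crawler.py (so the crawler above is saved under that name) and prints the stored songs: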
import sys

from crawler import load_pickle

data = load_pickle(sys.argv[1])
for i in data:
    print i[0], '-', i[1]
print len(data)
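
Since this loader takes the pickle path on the command line, and the gist does not show the file's name, saving it as, say, print_songs.py (a placeholder) would let it run as:

python print_songs.py songs2.pickle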