Created
August 24, 2017 15:49
-
-
Save KokaKiwi/bcb91c08f7d1a1252fc9ab4ea8bc1529 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Get Rick & Morty episodes ("rick" as in "https://ctoon.party/rick"): | |
> scrapy runspider ctoonparty.py -o rick.json -a show=rick | |
(add -a season=2 to get only one season) | |
Extract links only: | |
> jq '.[].url' -r < rick.json > rick.txt | |
> wget -c -i rick.txt | |
Extract links with subtitles: | |
> jq -r '.[]|.url,.tracks.en//empty' rick.json > rick.txt
""" | |
import scrapy | |
import re | |
def get_best_source(sources):
    """Pick the highest-quality entry among the given ``<source>`` selectors.

    Qualities are tried in order of preference ('1080p' first, then '720p').
    For each quality the first selector whose ``label`` attribute matches
    wins.

    Returns a ``(label, src)`` tuple, or ``None`` when no selector carries
    one of the preferred labels.
    """
    for wanted in ('1080p', '720p'):
        for source in sources:
            label = source.css('::attr(label)').extract_first()
            if label != wanted:
                continue
            return (label, source.css('::attr(src)').extract_first())
    return None
def get_tracks(tracks):
    """Yield a ``(srclang, src)`` pair for every ``<track>`` selector given.

    Either element of the pair is ``None`` when the corresponding attribute
    is missing on the selector.
    """
    for node in tracks:
        language = node.css('::attr(srclang)').extract_first()
        location = node.css('::attr(src)').extract_first()
        yield language, location
class CToonSpider(scrapy.Spider):
    """Spider that collects video and subtitle links for one ctoon.party show.

    Spider arguments (passed with ``-a``):
        show:   required show slug, e.g. ``rick`` for
                ``https://ctoon.party/rick``.
        season: optional season number; restricts the crawl to the index
                section ``#collapseNN`` for that season.

    Each scraped item is a dict with the keys ``url``, ``quality`` and
    ``tracks`` (a mapping of subtitle language code to subtitle URL).
    """

    name = 'ctoon'

    def __init__(self, *args, show=None, season=None, **kwargs):
        """Validate the spider arguments and derive the start URL.

        Raises:
            ValueError: if ``show`` is missing or contains a ``/``, or if
                ``season`` is not an integer.
        """
        # Let Scrapy perform its own initialisation (name check, storing
        # extra -a arguments as attributes) before adding our state.
        super().__init__(*args, **kwargs)
        if not show or '/' in show:
            raise ValueError("use -a show=<name>")
        self.show = show
        self.season = int(season) if season else None
        self.base = 'https://ctoon.party/%s' % self.show
        self.start_urls = [self.base]
        # Escape the base URL so that '.' in the host name and any regex
        # metacharacter in the show slug are matched literally.
        self._episode_re = re.compile(re.escape(self.base) + r'/[0-9a-z]+/?')

    def parse(self, response):
        """Handle both the show index page and individual episode pages.

        On an episode page, yield one item describing the best video source
        and its subtitle tracks. On the index page, follow every episode
        link (optionally limited to the requested season).

        Raises:
            Exception: if an episode page has no usable video source.
        """
        if self._episode_re.match(response.url):
            # Episode page
            best = get_best_source(response.css('video source'))
            if not best:
                raise Exception(
                    'no supported video source found on %s' % response.url)
            quality, url = best
            tracks = dict(get_tracks(response.css('video track')))
            yield {'url': url, 'quality': quality, 'tracks': tracks}
            return

        # Index page. Compare against None so that season 0 is still
        # treated as an explicit filter rather than "all seasons".
        if self.season is not None:
            scope = '#collapse%02d' % self.season
        else:
            scope = ''
        for next_page in response.css(scope + ' .ep-entry a'):
            yield response.follow(next_page, self.parse)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment