-
-
Save henryjfry/8da2b90aa4a4ef09110625a56b2367c7 to your computer and use it in GitHub Desktop.
| import json | |
| import requests | |
| import time | |
def get_imdb_videos(imdb_id):
    """Fetch all trailer videos for an IMDb title from IMDb's GraphQL API.

    Requests the first page via TitleVideoGallerySubPage (which also carries
    the title/plot metadata), then follows pageInfo cursors via
    TitleVideoGalleryPagination until exhausted. Every returned video node is
    augmented with the title's plot text, the total video count, and the
    title's display name.

    imdb_id: IMDb title const, e.g. 'tt4532368'.
    Returns: list of video-node dicts (possibly empty).
    Raises: requests.HTTPError on a non-2xx API response.
    """
    import re, requests
    API_URL = "https://graphql.prod.api.imdb.a2z.com/"
    HEADERS = {
        'Referer': 'https://www.imdb.com/',
        'Origin': 'https://www.imdb.com',
        'User-Agent': 'Mozilla/5.0'
    }

    def gqlmin(q):
        # Collapse the literal 4-space indent runs so the query payload stays small.
        return re.sub(' {4}', '', q)

    query_subpage = '''
    query TitleVideoGallerySubPage(
        $const: ID!,
        $first: Int!,
        $filter: VideosQueryFilter,
        $sort: VideoSort
    ) {
        title(id: $const) {
            titleText { text }
            plot { plotText { plainText } }
            videoStrip(first: $first, filter: $filter, sort: $sort) {
                ...VideoGalleryItems
            }
        }
    }
    '''
    query_pagination = '''
    query TitleVideoGalleryPagination(
        $const: ID!,
        $first: Int!,
        $after: ID!,
        $filter: VideosQueryFilter,
        $sort: VideoSort
    ) {
        title(id: $const) {
            videoStrip(first: $first, after: $after, filter: $filter, sort: $sort) {
                ...VideoGalleryItems
            }
        }
    }
    '''
    fragment = '''
    fragment VideoGalleryItems on VideoConnection {
        pageInfo {
            endCursor
            hasNextPage
        }
        total
        edges {
            node {
                id
                contentType { id }
                name { value }
                runtime { value }
                thumbnail { url }
                primaryTitle {
                    series {
                        displayableEpisodeNumber {
                            displayableSeason {
                                season
                            }
                        }
                        series {
                            titleText { text }
                        }
                    }
                }
            }
        }
    }
    '''
    variables = {
        "const": imdb_id,
        "first": 50,
        # TRAILER-only filter; INCLUDE_MATURE keeps age-restricted clips in.
        "filter": {"maturityLevel": "INCLUDE_MATURE", "nameConstraints": {}, "titleConstraints": {}, "types": ["TRAILER"]},
        "sort": {"by": "DATE", "order": "DESC"}
    }
    videos = []
    # First page
    pdata = {
        'operationName': "TitleVideoGallerySubPage",
        'query': gqlmin(query_subpage + fragment),
        'variables': variables
    }
    r = requests.post(API_URL, headers=HEADERS, json=pdata, timeout=30)
    r.raise_for_status()
    json_data = r.json()
    # GraphQL returns JSON null (None) for missing title/plot; 'or {}' keeps
    # the chained .get() lookups from raising AttributeError on None.
    title_data = json_data.get('data', {}).get('title') or {}
    plot_text = ((title_data.get('plot') or {}).get('plotText') or {}).get('plainText', "")
    item_title = (title_data.get('titleText') or {}).get('text', "")
    video_data = title_data.get('videoStrip') or {}
    total_videos = video_data.get('total')
    videos.extend(edge.get('node', {}) for edge in video_data.get('edges', []))
    page_info = video_data.get('pageInfo') or {}
    cursor = page_info.get('endCursor')
    has_next = page_info.get('hasNextPage', False)
    # Pagination loop
    while has_next and cursor:
        variables["after"] = cursor
        pdata = {
            'operationName': "TitleVideoGalleryPagination",
            'query': gqlmin(query_pagination + fragment),
            'variables': variables
        }
        r = requests.post(API_URL, headers=HEADERS, json=pdata, timeout=30)
        r.raise_for_status()
        video_data = (r.json().get('data', {}).get('title') or {}).get('videoStrip') or {}
        videos.extend(edge.get('node', {}) for edge in video_data.get('edges', []))
        page_info = video_data.get('pageInfo') or {}
        cursor = page_info.get('endCursor')
        has_next = page_info.get('hasNextPage', False)
        time.sleep(0.3)  # be polite to the API between pages
    # Match old output: inject plot, total, and item_title into every node
    # (dicts are mutated in place; no need to reassign into the list).
    for v in videos:
        v["plot"] = plot_text
        v["total"] = total_videos
        v["item_title"] = item_title
    return videos
def time_format(seconds: int) -> str:
    """Render a duration in seconds as a compact 'DD H m s' string.

    Uses the largest non-zero unit as the leading component, e.g.
    3661 -> '01H 01m 01s'. Returns '-' for None or zero.
    """
    if seconds is None:
        return '-'
    total = int(seconds)
    days, remainder = divmod(total, 3600 * 24)
    hours, remainder = divmod(remainder, 3600)
    minutes, secs = divmod(remainder, 60)
    if days > 0:
        return '{:02d}D {:02d}H {:02d}m {:02d}s'.format(days, hours, minutes, secs)
    if hours > 0:
        return '{:02d}H {:02d}m {:02d}s'.format(hours, minutes, secs)
    if minutes > 0:
        return '{:02d}m {:02d}s'.format(minutes, secs)
    if secs > 0:
        return '{:02d}s'.format(secs)
    return '-'
| import re | |
def extract_season_number(title):
    """Extract a season number from a clip title such as 'Official Trailer Season 2'.

    Matches "Season" or "Series" followed by optional whitespace, optional
    punctuation (':' or '-'), and digits, case-insensitively. The original
    pattern never actually allowed the punctuation its comment promised
    (so 'Season: 3' returned None); this version does.

    Returns the season as an int, or None when no number follows.
    """
    match = re.search(r"(?:Season|Series)\s*[:\-]?\s*(\d+)", title, re.IGNORECASE)
    if match:
        return int(match.group(1))
    return None
def find_best_trailer(trailer_list, season_number=None):
    """Pick the most relevant trailer from get_imdb_videos() output.

    Preference order: a trailer for the requested season (or the nearest
    earlier season), then a theatrical/full/final trailer, then an
    'official' trailer (non-teaser preferred), then simply the longest.

    trailer_list: video-node dicts as returned by get_imdb_videos().
    season_number: optional int season to match for episodic titles.
    Returns: a summary dict for the chosen trailer, or None.
    """
    if len(trailer_list) == 0:
        return None
    # Longest first, so index 0 is the default fallback.
    trailer_list = sorted(trailer_list, key=lambda x: x['runtime']['value'], reverse=True)
    new_trailer_list = []
    season_list = []
    official_flag = False
    theatrical_list = ['theatrical', 'full', 'final']
    theatrical_flag = False
    titleText = None
    for trailer in trailer_list:
        if trailer['contentType']['id'] != 'amzn1.imdb.video.contenttype.trailer':
            continue
        curr_dict = {}
        season = None  # always reset; never carry a season over from the previous node
        if trailer['primaryTitle'].get('series', {}) != {}:
            try:
                season = int(trailer['primaryTitle']['series']['displayableEpisodeNumber']['displayableSeason']['season'])
            except (KeyError, TypeError, ValueError):
                season = None
        curr_dict['id'] = trailer['id']
        curr_dict['vid_url'] = 'https://www.imdb.com/video/%s/?ref_=ttvg_vi_1' % (str(trailer['id']))
        curr_dict['season'] = season
        curr_dict['title'] = trailer['name']['value']
        title_lower = str(curr_dict['title']).lower()
        if season:
            titleText = trailer['primaryTitle']['series']['series']['titleText']['text']
        if not season:
            # Fall back to parsing "Season N" out of the clip title itself.
            season = extract_season_number(curr_dict['title'])
            if season:
                curr_dict['season'] = season
        # Theatrical detection. The extra 'season' check stops season promos
        # like "Better Call Saul: A Look At The Final Season" from matching
        # on the word 'final'.
        if any(word in title_lower for word in theatrical_list) and 'season' not in title_lower:
            curr_dict['theatrical'] = True
            theatrical_flag = True
        else:
            curr_dict['theatrical'] = False
        if 'official' in title_lower:
            curr_dict['official'] = True
            official_flag = True
            if season:
                # A season-specific clip doesn't count as the title's official trailer.
                official_flag = False
                curr_dict['official'] = False
        else:
            curr_dict['official'] = False
        if season and season not in season_list:
            season_list.append(season)
        curr_dict['thumbnail'] = trailer['thumbnail']['url']
        curr_dict['runtime'] = trailer['runtime']['value']
        curr_dict['time'] = time_format(trailer['runtime']['value'])
        new_trailer_list.append(curr_dict)
    if not new_trailer_list:
        # No trailer-typed nodes at all (original raised IndexError here).
        return None
    # Work out which season (if any) we can satisfy.
    if season_number and season_number in season_list:
        season_match = True  # exact season available
    elif season_list != []:
        if season_number:
            # Nearest season at or below the requested one.
            for i in reversed(sorted(season_list)):
                if i <= season_number:
                    break
            season_match = i
        else:
            season_match = False
    else:
        season_match = False
    if type(season_match) == type(season_number):
        if season_match > season_number:
            # Only later seasons exist; treat as no season match.
            season_match = False
    official_trailer = None
    season_trailer = None
    if season_match == True and type(season_match) == type(True):
        for trailer in new_trailer_list:
            if trailer['season'] == season_number:
                season_trailer = trailer
                break
    elif season_match == False:
        season_trailer = new_trailer_list[0]  # longest trailer as fallback
    else:
        for trailer in new_trailer_list:
            if trailer['season'] == season_match:
                season_trailer = trailer
                break
    if theatrical_flag == True:
        for trailer in new_trailer_list:
            if trailer['theatrical']:
                official_trailer = trailer
                break
    elif official_flag == True:
        # Prefer an official trailer that is not a teaser; relax if none found.
        for trailer in new_trailer_list:
            if trailer['official'] and 'teaser' not in str(trailer['title']).lower():
                official_trailer = trailer
                break
        if not official_trailer:
            for trailer in new_trailer_list:
                if trailer['official']:
                    official_trailer = trailer
                    break
    elif titleText:
        # Series fallback: a clip named exactly after the series.
        for trailer in new_trailer_list:
            if trailer['title'] == titleText:
                official_trailer = trailer
                break
    # Let the official/theatrical pick override only when no season was matched.
    if official_trailer and official_flag:
        if season_match == False or season_trailer == None:
            season_trailer = official_trailer
    elif official_flag == False and official_trailer:
        if season_match == False:
            season_trailer = official_trailer
    return season_trailer
def extract_imdb_mp4_url(video_id):
    """Scrape an IMDb video page and return (url, info) for the best playback URL.

    Prefers the MP4 rendition; otherwise falls back to the first playback URL
    listed (or (None, None) if the list is empty — the original raised
    NameError there).

    video_id: IMDb video const, e.g. 'vi1020905497'.
    Returns: (url, playback-entry dict) tuple.
    Raises: Exception on a non-200 response, ValueError when the page has no
        playbackURLs blob.
    """
    url = f"https://www.imdb.com/video/{video_id}?ref_=ttvg_vi_26"
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch page: {response.status_code}")
    html = response.text
    # The playback URLs are embedded as a JSON array inside the page's script payload.
    parts = html.split('"playbackURLs":[', 1)
    if len(parts) < 2:
        raise ValueError(f"No playbackURLs found for video {video_id}")
    playback_json = '[' + parts[1].split('}]', 1)[0] + '}]'
    # json.loads instead of eval(): never eval scraped (untrusted) page content.
    playback_urls = json.loads(playback_json)
    fallback_url = None
    fallback_video = None
    for entry in playback_urls:
        if entry['videoMimeType'] == 'MP4':
            return entry['url'], entry
        if fallback_url is None:
            fallback_url = entry['url']
            fallback_video = entry
    return fallback_url, fallback_video
# Demo driver: fetch videos for a sample title, pick the best trailer,
# and resolve its MP4 stream URL.
all_videos = get_imdb_videos(imdb_id='tt4532368')
#print(all_videos)
best_trailer = find_best_trailer(all_videos, season_number=None)
if best_trailer:
    print(best_trailer)
    print(best_trailer['title'])
    video_url, video = extract_imdb_mp4_url(best_trailer['id'])
    print("MP4 URL:", video_url)
    print(video)
exit()  # NOTE(review): everything below is unreachable debug output while this exit() stands
#print(json.dumps(all_videos[:3], indent=2)) # Show first 3 videos
print(f"Total videos fetched: {len(all_videos)}")
print(all_videos)
for i in all_videos:
    # Crude season-trailer listing: substring checks against the raw node dict.
    if 'contenttype.trailer' in str(i) and 'season' in str(i['name']['value']).lower():
        print(i['name']['value'],' - ' ,time_format(i['runtime']['value']))
Think this should be what you need:
"filter": {"maturityLevel": "INCLUDE_MATURE","nameConstraints":{},"titleConstraints":{},"types":["TRAILER"]},
import json
import requests
import time
def get_imdb_videos(imdb_id):
    """Fetch all trailer videos for an IMDb title from IMDb's GraphQL API.

    Requests the first page via TitleVideoGallerySubPage (which also carries
    the title/plot metadata), then follows pageInfo cursors via
    TitleVideoGalleryPagination until exhausted. Every returned video node is
    augmented with the title's plot text, total video count, and title name.

    imdb_id: IMDb title const, e.g. 'tt11280740'.
    Returns: list of video-node dicts (possibly empty).
    Raises: requests.HTTPError on a non-2xx API response.
    """
    import re, requests
    API_URL = "https://graphql.prod.api.imdb.a2z.com/"
    HEADERS = {
        'Referer': 'https://www.imdb.com/',
        'Origin': 'https://www.imdb.com',
        'User-Agent': 'Mozilla/5.0'
    }
    def gqlmin(q):
        # Collapse the 4-space indent runs so the query payload stays small.
        return re.sub(' {4}', '', q)
    query_subpage = '''
    query TitleVideoGallerySubPage(
        $const: ID!,
        $first: Int!,
        $filter: VideosQueryFilter,
        $sort: VideoSort
    ) {
        title(id: $const) {
            titleText { text }
            plot { plotText { plainText } }
            videoStrip(first: $first, filter: $filter, sort: $sort) {
                ...VideoGalleryItems
            }
        }
    }
    '''
    query_pagination = '''
    query TitleVideoGalleryPagination(
        $const: ID!,
        $first: Int!,
        $after: ID!,
        $filter: VideosQueryFilter,
        $sort: VideoSort
    ) {
        title(id: $const) {
            videoStrip(first: $first, after: $after, filter: $filter, sort: $sort) {
                ...VideoGalleryItems
            }
        }
    }
    '''
    fragment = '''
    fragment VideoGalleryItems on VideoConnection {
        pageInfo {
            endCursor
            hasNextPage
        }
        total
        edges {
            node {
                id
                contentType { id }
                name { value }
                runtime { value }
                thumbnail { url }
                primaryTitle {
                    series {
                        displayableEpisodeNumber {
                            displayableSeason {
                                season
                            }
                        }
                        series {
                            titleText { text }
                        }
                    }
                }
            }
        }
    }
    '''
    variables = {
        "const": imdb_id,
        "first": 50,
        # TRAILER-only filter; INCLUDE_MATURE keeps age-restricted clips in.
        "filter": {"maturityLevel": "INCLUDE_MATURE","nameConstraints":{},"titleConstraints":{},"types":["TRAILER"]},
        "sort": {"by": "DATE", "order": "DESC"}
    }
    videos = []
    plot_text = ""
    item_title = ""
    total_videos = None
    # First page
    pdata = {
        'operationName': "TitleVideoGallerySubPage",
        'query': gqlmin(query_subpage + fragment),
        'variables': variables
    }
    r = requests.post(API_URL, headers=HEADERS, json=pdata)
    r.raise_for_status()
    json_data = r.json()
    # NOTE(review): if the API returns JSON null for title/plot, the chained
    # .get(..., {}) yields None and the next .get raises — assumes a valid id.
    title_data = json_data.get('data', {}).get('title', {})
    plot_text = title_data.get('plot', {}).get('plotText', {}).get('plainText', "")
    item_title = title_data.get('titleText', {}).get('text', "")
    video_data = title_data.get('videoStrip', {})
    total_videos = video_data.get('total')
    videos.extend([edge.get('node', {}) for edge in video_data.get('edges', [])])
    cursor = video_data.get('pageInfo', {}).get('endCursor')
    has_next = video_data.get('pageInfo', {}).get('hasNextPage', False)
    # Pagination loop
    while has_next and cursor:
        variables["after"] = cursor
        pdata = {
            'operationName': "TitleVideoGalleryPagination",
            'query': gqlmin(query_pagination + fragment),
            'variables': variables
        }
        r = requests.post(API_URL, headers=HEADERS, json=pdata)
        r.raise_for_status()
        video_data = r.json().get('data', {}).get('title', {}).get('videoStrip', {})
        videos.extend([edge.get('node', {}) for edge in video_data.get('edges', [])])
        cursor = video_data.get('pageInfo', {}).get('endCursor')
        has_next = video_data.get('pageInfo', {}).get('hasNextPage', False)
        time.sleep(0.3)  # be polite to the API between pages
    # Match old output: inject plot, total, and item_title
    for idx, v in enumerate(videos):
        v["plot"] = plot_text
        v["total"] = total_videos
        v["item_title"] = item_title
        videos[idx] = v  # redundant (dict mutated in place) but kept as-is
    return videos
# Smoke test: dump every video node for a sample title, then stop.
all_videos = get_imdb_videos(imdb_id='tt11280740')
print(all_videos)
exit()
However, other than "types", nameConstraints looks like "nameConstraints": {"allNameIds": ["nm0004395","nm3138882"]}
And titleconstraints:
nameConstraints: {
allNameIds: r.nameIds?.sort( (e, t) => e.localeCompare(t))
},
titleConstraints: {
anyTitleIds: r.titleIds?.sort( (e, t) => e.localeCompare(t))
i.e. anyTitleIds being IMDb IDs, e.g. tt11280740. So not actually "video clip name" or "video clip title".
Found a small issue when a trailer title contains "Final" but is actually for a final season, e.g. Better Call Saul's "Better Call Saul: A Look At The Final Season" — it gets picked up as a theatrical trailer. To stop this, I just added a second check that the title doesn't contain "season".
checkout Gujal00/Kodi-Official@c556e4e
I've provided working API lookups for all the pages currently scraped:
VideoPlayback => https://www.imdb.com/video/vi1020905497/?ref_=ttvg_vi_1
CalendarPage => https://www.imdb.com/calendar/?region=US&type=MOVIE&ref_=rlm
movies_near_you => https://www.imdb.com/showtimes/
Yes I already tried contenttype as a filter key and it came with incorrect parameter response, so yeah that is exactly what I am targeting and yet to figure out