-
-
Save henryjfry/8da2b90aa4a4ef09110625a56b2367c7 to your computer and use it in GitHub Desktop.
| import json | |
| import requests | |
| import time | |
def get_imdb_videos(imdb_id):
    """Fetch the trailer videos for an IMDb title from IMDb's GraphQL API.

    Pages through the title's ``videoStrip`` connection 50 items at a time
    and returns a list of video-node dicts.  Each dict additionally carries
    the title's plot text (``plot``), the total video count (``total``) and
    the title's name (``item_title``) so downstream code has context.

    :param imdb_id: IMDb title const, e.g. ``'tt4532368'``.
    :return: list of video dicts (may be empty).
    :raises requests.HTTPError: if the API responds with an error status.
    """
    import re
    import requests

    API_URL = "https://graphql.prod.api.imdb.a2z.com/"
    # The endpoint rejects requests that do not look like they come from
    # the imdb.com web front end.
    HEADERS = {
        'Referer': 'https://www.imdb.com/',
        'Origin': 'https://www.imdb.com',
        'User-Agent': 'Mozilla/5.0'
    }

    def gqlmin(q):
        # Strip the 4-space indentation used in the triple-quoted queries.
        return re.sub(' {4}', '', q)

    query_subpage = '''
    query TitleVideoGallerySubPage(
        $const: ID!,
        $first: Int!,
        $filter: VideosQueryFilter,
        $sort: VideoSort
    ) {
        title(id: $const) {
            titleText { text }
            plot { plotText { plainText } }
            videoStrip(first: $first, filter: $filter, sort: $sort) {
                ...VideoGalleryItems
            }
        }
    }
    '''
    query_pagination = '''
    query TitleVideoGalleryPagination(
        $const: ID!,
        $first: Int!,
        $after: ID!,
        $filter: VideosQueryFilter,
        $sort: VideoSort
    ) {
        title(id: $const) {
            videoStrip(first: $first, after: $after, filter: $filter, sort: $sort) {
                ...VideoGalleryItems
            }
        }
    }
    '''
    fragment = '''
    fragment VideoGalleryItems on VideoConnection {
        pageInfo {
            endCursor
            hasNextPage
        }
        total
        edges {
            node {
                id
                contentType { id }
                name { value }
                runtime { value }
                thumbnail { url }
                primaryTitle {
                    series {
                        displayableEpisodeNumber {
                            displayableSeason {
                                season
                            }
                        }
                        series {
                            titleText { text }
                        }
                    }
                }
            }
        }
    }
    '''
    variables = {
        "const": imdb_id,
        "first": 50,
        # "types": ["TRAILER"] makes the server return only trailers.
        "filter": {"maturityLevel": "INCLUDE_MATURE", "nameConstraints": {}, "titleConstraints": {}, "types": ["TRAILER"]},
        "sort": {"by": "DATE", "order": "DESC"}
    }

    videos = []
    # First page: also pulls title text and plot for the injected metadata.
    pdata = {
        'operationName': "TitleVideoGallerySubPage",
        'query': gqlmin(query_subpage + fragment),
        'variables': variables
    }
    r = requests.post(API_URL, headers=HEADERS, json=pdata)
    r.raise_for_status()
    json_data = r.json()
    # GraphQL returns null (not a missing key) for absent objects, so guard
    # every level with `or {}` to avoid AttributeError on None.
    title_data = (json_data.get('data') or {}).get('title') or {}
    plot_text = ((title_data.get('plot') or {}).get('plotText') or {}).get('plainText', "")
    item_title = (title_data.get('titleText') or {}).get('text', "")
    video_data = title_data.get('videoStrip') or {}
    total_videos = video_data.get('total')
    videos.extend([edge.get('node', {}) for edge in video_data.get('edges', [])])
    page_info = video_data.get('pageInfo') or {}
    cursor = page_info.get('endCursor')
    has_next = page_info.get('hasNextPage', False)

    # Pagination loop: follow endCursor until the server reports no more pages.
    while has_next and cursor:
        variables["after"] = cursor
        pdata = {
            'operationName': "TitleVideoGalleryPagination",
            'query': gqlmin(query_pagination + fragment),
            'variables': variables
        }
        r = requests.post(API_URL, headers=HEADERS, json=pdata)
        r.raise_for_status()
        video_data = ((r.json().get('data') or {}).get('title') or {}).get('videoStrip') or {}
        videos.extend([edge.get('node', {}) for edge in video_data.get('edges', [])])
        page_info = video_data.get('pageInfo') or {}
        cursor = page_info.get('endCursor')
        has_next = page_info.get('hasNextPage', False)
        time.sleep(0.3)  # be polite to the API between pages

    # Match old output: inject plot, total and item_title into every node.
    # (Dicts are mutated in place; no need to write them back by index.)
    for v in videos:
        v["plot"] = plot_text
        v["total"] = total_videos
        v["item_title"] = item_title
    return videos
def time_format(seconds: int) -> str:
    """Render a duration in seconds as a compact string.

    Returns e.g. '01D 02H 03m 04s', dropping the leading units that are
    zero.  Returns '-' when *seconds* is None or the duration is zero.
    """
    if seconds is None:
        return '-'
    total = int(seconds)
    days, remainder = divmod(total, 3600 * 24)
    hours, remainder = divmod(remainder, 3600)
    minutes, secs = divmod(remainder, 60)
    if days > 0:
        return '{:02d}D {:02d}H {:02d}m {:02d}s'.format(days, hours, minutes, secs)
    if hours > 0:
        return '{:02d}H {:02d}m {:02d}s'.format(hours, minutes, secs)
    if minutes > 0:
        return '{:02d}m {:02d}s'.format(minutes, secs)
    if secs > 0:
        return '{:02d}s'.format(secs)
    return '-'
| import re | |
def extract_season_number(title):
    """Extract a season number from a video title.

    Looks for "Season"/"Series" (case-insensitive) immediately followed by
    digits, e.g. "Official Trailer Season 2" -> 2.  When several such
    tokens appear, the last one wins (matching the old greedy-regex
    behaviour).  Returns None when no numbered season token is present.
    """
    # Fixes the original pattern's `(:?` typo (a capturing group matching a
    # literal optional colon, not the intended non-capturing `(?:`), and
    # replaces the fragile group(0)/replace/int-of-empty-string extraction
    # with a direct capture group.
    matches = re.findall(r"(?:Season|Series)\s*(\d+)", str(title), re.IGNORECASE)
    if matches:
        return int(matches[-1])
    return None
def find_best_trailer(trailer_list, season_number=None):
    """Pick the single most appropriate trailer from the raw video list.

    Heuristic: prefer a trailer matching *season_number* (or the closest
    earlier season), fall back to a 'theatrical'/'full'/'final' trailer,
    then an 'official' one, then the longest trailer overall.

    NOTE(review): indentation reconstructed from an unindented paste —
    nesting of some statements is a best guess; verify against the gist.

    :param trailer_list: video-node dicts from get_imdb_videos().
    :param season_number: season the caller wants a trailer for, or None.
    :return: a simplified trailer dict, or None if *trailer_list* is empty.
    """
    if len(trailer_list) == 0:
        return None
    best_match = None          # NOTE(review): never used below
    best_score = -1            # NOTE(review): never used below
    fallback_thumbnail = None  # NOTE(review): never used below
    # Longest runtime first, so index 0 is the "biggest" trailer fallback.
    trailer_list = sorted(trailer_list, key=lambda x: x['runtime']['value'], reverse=True)
    match_list = []            # NOTE(review): never used below
    new_trailer_list = []
    season_list = []           # distinct season numbers seen among trailers
    official_flag = False      # True once any non-season 'official' trailer is seen
    theatrical_list = ['theatrical','full','final']
    theatrical_flag = False    # True once any theatrical-looking title is seen
    titleText = None           # series title text, captured from a season trailer
    for trailer in trailer_list:
        # Only consider real trailers; clips/featurettes are skipped.
        if trailer['contentType']['id'] == 'amzn1.imdb.video.contenttype.trailer':
            curr_dict = {}
            if trailer['primaryTitle'].get('series',{}) != {}:
                # Season number from the episode metadata, when present.
                try: season = int(trailer['primaryTitle']['series']['displayableEpisodeNumber']['displayableSeason']['season'])
                except: season = None
            # NOTE(review): if the branch above is skipped (no 'series' key),
            # `season` keeps its value from the previous iteration — or is
            # unbound on the first one.  Confirm the API always sends 'series'.
            #print(trailer)
            curr_dict['id'] = trailer['id']
            curr_dict['vid_url'] = 'https://www.imdb.com/video/%s/?ref_=ttvg_vi_1' % (str(trailer['id']))
            curr_dict['season'] = season
            curr_dict['title'] = trailer['name']['value']
            if season:
                # Remember the parent series name for the titleText fallback.
                titleText = trailer['primaryTitle']['series']['series']['titleText']['text']
            if not season:
                # Fall back to parsing the season number out of the title text.
                season = extract_season_number(curr_dict['title'])
                if season:
                    curr_dict['season'] = season
            if any(word in str(curr_dict['title']).lower() for word in theatrical_list):
                curr_dict['theatrical'] = True
                theatrical_flag = True
            else:
                curr_dict['theatrical'] = False
            if 'official' in str(curr_dict['title']).lower():
                curr_dict['official'] = True
                official_flag = True
                if season:
                    # Season-specific "official" trailers don't count as the
                    # title-level official trailer.
                    official_flag = False
                    curr_dict['official'] = False
            else:
                curr_dict['official'] = False
            if season and not season in season_list:
                season_list.append(season)
            curr_dict['thumbnail'] = trailer['thumbnail']['url']
            curr_dict['runtime'] = trailer['runtime']['value']
            curr_dict['time'] = time_format(trailer['runtime']['value'])
            #print(curr_dict['title'])
            new_trailer_list.append(curr_dict)
    # season_match is tri-state: True (exact season found), False (no usable
    # season), or an int (closest earlier season available).
    if season_number and season_number in season_list:
        season_match = True
    elif season_list != []:
        if season_number:
            # Walk seasons descending; stop at the first one <= requested.
            for i in reversed(sorted(season_list)):
                if i <= season_number:
                    break
            season_match = i
        else:
            season_match = False
    else:
        season_match = False
    if type(season_match) == type(season_number):
        # Both ints here: a "closest" season later than the requested one is
        # no match at all.
        if season_match > season_number:
            season_match = False
    offical_trailer = None  # NOTE(review): 'offical' typo kept as in source
    season_trailer = None
    if season_match == True and type(season_match) == type(True):
        # Exact season available: take the longest trailer for that season.
        for trailer in new_trailer_list:
            if trailer['season'] == season_number:
                season_trailer = trailer
                break
    elif season_match == False:
        # No season info: default to the longest trailer overall.
        # NOTE(review): IndexError if no item had the trailer contentType.
        season_trailer = new_trailer_list[0]
    else:
        # season_match holds the closest earlier season number.
        for trailer in new_trailer_list:
            if trailer['season'] == season_match:
                season_trailer = trailer
                break
    if theatrical_flag == True:
        for trailer in new_trailer_list:
            if trailer['theatrical']:
                offical_trailer = trailer
                break
    elif official_flag == True:
        # Prefer an official trailer that is not a teaser.
        for trailer in new_trailer_list:
            if trailer['official'] and not 'teaser' in str(trailer['title']).lower():
                offical_trailer = trailer
                break
        if not offical_trailer:
            for trailer in new_trailer_list:
                if trailer['official']:
                    offical_trailer = trailer
                    break
    elif titleText:
        # Last resort: a trailer named exactly after the series itself.
        for trailer in new_trailer_list:
            if trailer['title'] == titleText:
                offical_trailer = trailer
                break
    # Let the official/theatrical pick override only when no season-specific
    # trailer was resolved.
    if offical_trailer and official_flag:
        if season_match == False or season_trailer == None:
            season_trailer = offical_trailer
    elif official_flag == False and offical_trailer:
        if season_match == False:
            season_trailer = offical_trailer
    #print(new_trailer_list)
    #print(titleText)
    return season_trailer
def extract_imdb_mp4_url(video_id):
    """Scrape the IMDb video page and return a playable URL for *video_id*.

    Finds the embedded ``playbackURLs`` JSON in the page HTML.  Prefers the
    MP4 rendition; otherwise falls back to the first playback entry.

    :param video_id: IMDb video id, e.g. 'vi1020905497'.
    :return: (url, playback_entry_dict) — (None, None) if the page listed
        no playback entries at all.
    :raises Exception: if the page fetch fails or the playbackURLs marker
        is not present in the HTML.
    """
    import json

    url = f"https://www.imdb.com/video/{video_id}?ref_=ttvg_vi_26"
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch page: {response.status_code}")
    html = response.text
    marker = '"playbackURLs":['
    # Fail with a clear message instead of an IndexError on split()[1].
    if marker not in html:
        raise Exception("playbackURLs not found in video page HTML")
    PlaybackURL = ('[' + html.split(marker)[1].split('}]')[0] + '}]')
    # Security fix: the snippet is plain JSON, so parse it with json.loads
    # instead of eval()ing content scraped from the network.
    playback_entries = json.loads(PlaybackURL)
    url = None
    video = None  # fix: previously unbound when the list was empty
    for entry in playback_entries:
        if entry['videoMimeType'] == 'MP4':
            return entry['url'], entry
        if not url:
            # Remember the first non-MP4 entry as a fallback.
            url = entry['url']
            video = entry
        #print(entry['videoDefinition'])
        #print(entry['videoMimeType'])
    return url, video
# --- Demo driver: fetch videos for a title, pick and resolve its trailer. ---
# NOTE(review): indentation reconstructed from an unindented paste; the
# placement of exit() relative to the `if` block is a best guess.
all_videos = get_imdb_videos(imdb_id='tt4532368')
#print(all_videos)
best_trailer = find_best_trailer(all_videos, season_number=None)
if best_trailer:
    print(best_trailer)
    print(best_trailer['title'])
    # Resolve the chosen trailer's id to a direct MP4 playback URL.
    video_url, video = extract_imdb_mp4_url(best_trailer['id'])
    print("MP4 URL:", video_url)
    print(video)
exit()
# Everything below is unreachable after exit(): an older demo that listed
# the season trailers with their runtimes.
#print(json.dumps(all_videos[:3], indent=2)) # Show first 3 videos
print(f"Total videos fetched: {len(all_videos)}")
print(all_videos)
for i in all_videos:
    if 'contenttype.trailer' in str(i) and 'season' in str(i['name']['value']).lower():
        print(i['name']['value'],' - ' ,time_format(i['runtime']['value']))
Yeah I don't do too much Ai coding either but as I don't know JavaScript at all I never would have been able to figure out how to make a compliant query without it.
But I started out trial and error in the terminal too.
Although I now do programming adjacent stuff for work. We don't really use AI as there are GDPR data issues involved but the copilot thing is new and is our own instance I believe so we've been curious about the capabilities and trialing it a bit in our area.
But I mostly do SQL on random data so ai isn't much help as half the thing is figuring out what you are looking at.
But when I am doing random things it is pretty handy to now have access to an interactive stack overflow on steroids.
Does it hallucinate occasionally, yes. But can it parse badly documented/undocumented code and give good info back, yes surprisingly it can. And getting working example code relevant to your problem when you go googling is often half the battle so it's definitely a valuable tool.
FYI the video properties returned contain contentType which has a trailer/clip info like "amzn1.imdb.video.contenttype.trailer" which might be what you need?
Otherwise maybe the query has those as variables inputs?
They might be called nameSearchConstraints?
That difference (IE search) was an error I saw myself (did you mean...)
Yes, I already tried contenttype as a filter key and it came back with an "incorrect parameter" response — so yes, that is exactly what I am targeting and have yet to figure out.
Think this should be what you need:
"filter": {"maturityLevel": "INCLUDE_MATURE","nameConstraints":{},"titleConstraints":{},"types":["TRAILER"]},
import json
import requests
import time
def get_imdb_videos(imdb_id):
    """Fetch the trailer videos for an IMDb title from IMDb's GraphQL API.

    Pages through the title's ``videoStrip`` connection 50 items at a time
    and returns a list of video-node dicts, each augmented with the title's
    ``plot``, ``total`` video count and ``item_title``.

    NOTE(review): duplicate of the gist's main function, pasted in a comment
    with indentation stripped; structure reconstructed here.

    :param imdb_id: IMDb title const, e.g. ``'tt11280740'``.
    :return: list of video dicts (may be empty).
    :raises requests.HTTPError: if the API responds with an error status.
    """
    import re, requests
    API_URL = "https://graphql.prod.api.imdb.a2z.com/"
    # The endpoint rejects requests that don't look like the imdb.com front end.
    HEADERS = {
        'Referer': 'https://www.imdb.com/',
        'Origin': 'https://www.imdb.com',
        'User-Agent': 'Mozilla/5.0'
    }
    def gqlmin(q):
        # Strip the 4-space indentation used in the triple-quoted queries.
        return re.sub(' {4}', '', q)
    query_subpage = '''
    query TitleVideoGallerySubPage(
        $const: ID!,
        $first: Int!,
        $filter: VideosQueryFilter,
        $sort: VideoSort
    ) {
        title(id: $const) {
            titleText { text }
            plot { plotText { plainText } }
            videoStrip(first: $first, filter: $filter, sort: $sort) {
                ...VideoGalleryItems
            }
        }
    }
    '''
    query_pagination = '''
    query TitleVideoGalleryPagination(
        $const: ID!,
        $first: Int!,
        $after: ID!,
        $filter: VideosQueryFilter,
        $sort: VideoSort
    ) {
        title(id: $const) {
            videoStrip(first: $first, after: $after, filter: $filter, sort: $sort) {
                ...VideoGalleryItems
            }
        }
    }
    '''
    fragment = '''
    fragment VideoGalleryItems on VideoConnection {
        pageInfo {
            endCursor
            hasNextPage
        }
        total
        edges {
            node {
                id
                contentType { id }
                name { value }
                runtime { value }
                thumbnail { url }
                primaryTitle {
                    series {
                        displayableEpisodeNumber {
                            displayableSeason {
                                season
                            }
                        }
                        series {
                            titleText { text }
                        }
                    }
                }
            }
        }
    }
    '''
    variables = {
        "const": imdb_id,
        "first": 50,
        # "types": ["TRAILER"] makes the server return only trailers.
        "filter": {"maturityLevel": "INCLUDE_MATURE","nameConstraints":{},"titleConstraints":{},"types":["TRAILER"]},
        "sort": {"by": "DATE", "order": "DESC"}
    }
    videos = []
    plot_text = ""
    item_title = ""
    total_videos = None
    # First page: also pulls title text and plot for the injected metadata.
    pdata = {
        'operationName': "TitleVideoGallerySubPage",
        'query': gqlmin(query_subpage + fragment),
        'variables': variables
    }
    r = requests.post(API_URL, headers=HEADERS, json=pdata)
    r.raise_for_status()
    json_data = r.json()
    # NOTE(review): GraphQL sends null (not a missing key) for absent
    # objects, so these .get(...).get(...) chains can hit None — verify.
    title_data = json_data.get('data', {}).get('title', {})
    plot_text = title_data.get('plot', {}).get('plotText', {}).get('plainText', "")
    item_title = title_data.get('titleText', {}).get('text', "")
    video_data = title_data.get('videoStrip', {})
    total_videos = video_data.get('total')
    videos.extend([edge.get('node', {}) for edge in video_data.get('edges', [])])
    cursor = video_data.get('pageInfo', {}).get('endCursor')
    has_next = video_data.get('pageInfo', {}).get('hasNextPage', False)
    # Pagination loop: follow endCursor until the server reports no more pages.
    while has_next and cursor:
        variables["after"] = cursor
        pdata = {
            'operationName': "TitleVideoGalleryPagination",
            'query': gqlmin(query_pagination + fragment),
            'variables': variables
        }
        r = requests.post(API_URL, headers=HEADERS, json=pdata)
        r.raise_for_status()
        video_data = r.json().get('data', {}).get('title', {}).get('videoStrip', {})
        videos.extend([edge.get('node', {}) for edge in video_data.get('edges', [])])
        cursor = video_data.get('pageInfo', {}).get('endCursor')
        has_next = video_data.get('pageInfo', {}).get('hasNextPage', False)
        time.sleep(0.3)  # be polite to the API between pages
    # Match old output: inject plot, total, and item_title
    for idx, v in enumerate(videos):
        v["plot"] = plot_text
        v["total"] = total_videos
        v["item_title"] = item_title
        videos[idx] = v  # redundant: v is mutated in place
    return videos
# Demo: fetch and dump all trailer videos for one title, then stop.
all_videos = get_imdb_videos(imdb_id='tt11280740')
print(all_videos)
exit()
However, other than "types", nameConstraints looks like "nameConstraints":{"allNameIds":["nm0004395","nm3138882"]}
And titleconstraints:
nameConstraints: {
allNameIds: r.nameIds?.sort( (e, t) => e.localeCompare(t))
},
titleConstraints: {
anyTitleIds: r.titleIds?.sort( (e, t) => e.localeCompare(t))
i.e. anyTitleIds being IMDb ids such as tt11280740 — so not actually "video clip name" or "video clip title".
Found a small issue when a trailer title contains "Final" but is actually for a final season, e.g. Better Call Saul's "Better Call Saul: A Look At The Final Season" — it gets picked up as a theatrical trailer. To stop this, I just added a second check that the title doesn't contain "season".
checkout Gujal00/Kodi-Official@c556e4e
I've provided working API lookups for all the pages currently scraped:
VideoPlayback => https://www.imdb.com/video/vi1020905497/?ref_=ttvg_vi_1
CalendarPage => https://www.imdb.com/calendar/?region=US&type=MOVIE&ref_=rlm
movies_near_you => https://www.imdb.com/showtimes/
I stay away from AI coding and I am not a programmer by profession either. Just trial and error in Python at hobbyist level to get things going :)
Ideally I would like to find out the nameConstraints or titleConstraints needed to filter only trailers to be returned rather than all videos, but I haven't been able to figure that out yet — maybe fellow kiwi @matthuisman has some ideas.