Created
May 12, 2020 17:36
-
-
Save south1907/9ad26e860f78d452b836f431d634e682 to your computer and use it in GitHub Desktop.
Get all Facebook comment-plugin comments for every film in a phimmoi category
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import json | |
import time | |
from bs4 import BeautifulSoup | |
# Wall-clock timestamp taken when the script starts; used at the end of the
# script to compute the total run time.
start_time = time.time()

# HTTP headers sent with every comment-pager request.
# NOTE(review): the cookie value is hard-coded; presumably it is tied to a
# browser session and may need refreshing -- confirm.
headers = {'cookie': 'fr=0shZ5eUbOjaYtgLs0..BeuWpJ...1.0.BeuWpJ.'}

# Value sent as the "limit" form field of each comment-pager request.
limit = 20
def get_id_film(url_film):
    """Return the Facebook object ID (``targetFBID``) for a film page URL.

    Requests the Facebook comments-plugin page with ``href`` set to
    ``url_film`` and extracts the value of the ``"targetFBID"`` field from
    the response text.

    Args:
        url_film: URL of the film page, passed as the plugin's ``href``.

    Returns:
        The ``targetFBID`` value as a string.

    Raises:
        ValueError: If the response text contains no complete
            ``"targetFBID":"..."`` field.
    """
    marker = '"targetFBID":"'
    plugin_comment_root = 'https://www.facebook.com/plugins/feedback.php'

    # Request the plugin page with href = url_film.
    response = requests.get(plugin_comment_root, params={'href': url_film})
    body = response.text

    # Locate the marker; without this check a missing marker (-1) would
    # silently yield an unrelated slice of the page.
    marker_index = body.find(marker)
    if marker_index == -1:
        raise ValueError('targetFBID not found in plugin response for ' + url_film)

    # The ID runs from the end of the marker up to the next double quote.
    start_index = marker_index + len(marker)
    end_index = body.find('"', start_index)
    if end_index == -1:
        raise ValueError('unterminated targetFBID in plugin response for ' + url_film)

    return body[start_index:end_index]
def get_page(film_id, after_cursor=''):
    """Fetch one page of comments for a Facebook comments-plugin object.

    Posts to the plugin's ``pager/reverse_time`` endpoint using the
    module-level ``headers`` and ``limit`` and parses the JSON reply.

    Args:
        film_id: Facebook object ID inserted into the pager URL.
        after_cursor: Pagination cursor taken from a previous page's
            ``next`` value; the default empty string is what the first
            request sends.

    Returns:
        A dict with three keys:
            ``data``: list of dicts, one per entry of type ``comment``,
                with keys ``comment_user``, ``comment_content``,
                ``comment_timestamp`` and ``film_id``.
            ``next``: the payload's ``afterCursor`` value.
            ``film_name``: ``name`` of the first ``ogobject`` entry, or
                ``''`` when the page has none.
    """
    url = 'https://www.facebook.com/plugins/comments/async/' + film_id + '/pager/reverse_time/'
    form = {
        'after_cursor': after_cursor,
        'limit': limit,
        '__a': '1',
    }
    response = requests.post(url, headers=headers, data=form)

    # The first 9 characters of the body are skipped before JSON parsing
    # (presumably Facebook's "for (;;);" guard prefix -- TODO confirm).
    payload = json.loads(response.text[9:])['payload']

    comments = []
    film_name = ''
    # idMap mixes several entry types; only "comment" and "ogobject" entries
    # contribute to the result ("user" entries are ignored).
    for item in payload['idMap'].values():
        item_type = item['type']
        if item_type == 'comment':
            comments.append({
                'comment_user': item['authorID'],
                'comment_content': item['body']['text'],
                'comment_timestamp': item['timestamp'],
                'film_id': film_id,
            })
        elif item_type == 'ogobject' and film_name == '':
            # Keep only the first object name encountered.
            film_name = item['name']

    return {
        'data': comments,
        'next': payload['afterCursor'],
        'film_name': film_name,
    }
def get_all_of_film(url_film):
    """Collect every comment of a film by walking all comment pages.

    Resolves the film's Facebook object ID with ``get_id_film`` and then
    calls ``get_page`` repeatedly, feeding each page's ``next`` cursor into
    the following request.

    Args:
        url_film: URL of the film page.

    Returns:
        A dict with ``data`` (list of all comment dicts gathered from the
        pages) and ``film_id`` (the resolved Facebook object ID).
    """
    film_id = get_id_film(url_film)
    results = []
    after_cursor = ''

    while True:
        page = get_page(film_id, after_cursor)

        # No more comments: stop.
        if not page['data']:
            break

        # Otherwise accumulate and move on to the next page.
        results.extend(page['data'])
        after_cursor = page['next']

        # An empty/missing cursor would make the next request identical to
        # the very first one, so the loop would never end; stop here instead.
        if not after_cursor:
            break

    return {
        'data': results,
        'film_id': film_id,
    }
def get_film_of_cate(cate, page):
    """List the films shown on one page of a phimmoi category.

    Downloads the category listing page and reads every ``<a>`` element
    with class ``block-wrapper``.

    Args:
        cate: Category slug inserted into the listing URL.
        page: Page number of the listing (converted with ``str``).

    Returns:
        A list of dicts with ``href`` (the domain prefix concatenated with
        the link's ``href`` attribute) and ``title`` (the link's ``title``
        attribute with ``/`` replaced by a space, so it can be used in a
        file name).
    """
    domain = 'http://phimmoi.net/'
    url_cate_film = 'http://www.phimmoi.net/the-loai/' + cate + '/page-' + str(page) + '.html'

    response = requests.get(url_cate_film)
    soup = BeautifulSoup(response.text, features="html.parser")

    # find_all is the current name of the deprecated findAll alias.
    links = soup.find_all('a', {'class': 'block-wrapper'})

    return [
        {
            'href': domain + link['href'],
            'title': link['title'].replace('/', ' '),
        }
        for link in links
    ]
# Script driver: download all comments for every film on one category page
# and write them to one JSON file per film under the "data" directory.
import os

cate_film = 'phim-hanh-dong'
page_film = 1

# Make sure the output directory exists before any file is opened.
output_dir = 'data'
os.makedirs(output_dir, exist_ok=True)

list_film = get_film_of_cate(cate_film, page_film)
for film in list_film:
    url_film = film['href']
    title_film = film['title']
    print(title_film)

    all_data = get_all_of_film(url_film)

    output_path = os.path.join(output_dir, title_film + '_' + all_data['film_id'] + '.json')
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly instead of relying on the locale default.
    with open(output_path, 'w', encoding='utf-8') as outfile:
        json.dump(all_data['data'], outfile, indent=4, ensure_ascii=False)

end_time = time.time()
total_time = end_time - start_time
# print(total_time)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment