south1907 · May 12, 2020 17:36
diff --git a/get_ful_comment.py b/get_ful_comment.py
 import requests
 import json
 import time
 from bs4 import BeautifulSoup

 # Wall-clock start of the run; subtracted from end_time at the end of the script.
 start_time = time.time()

 # HTTP headers sent with the comment pager POST in get_page.
 # NOTE(review): the cookie value is hard-coded; presumably a Facebook session
 # cookie that can expire -- confirm, and consider loading it from configuration.
 headers = {
 	'cookie': 'fr=0shZ5eUbOjaYtgLs0..BeuWpJ...1.0.BeuWpJ.'
 }
 # Value of the 'limit' form field sent by get_page
 # (presumably the number of comments per page -- confirm).
 limit = 20

 def get_id_film(url_film):
 	"""Resolve the Facebook comment-plugin target ID for a film page URL.

 	Requests the Facebook ``feedback.php`` plugin page with ``href`` set to
 	``url_film`` and extracts the value of the ``"targetFBID"`` field from
 	the returned text.

 	Args:
 		url_film: URL of the film page, sent as the ``href`` query parameter.

 	Returns:
 		The text between ``"targetFBID":"`` and the next double quote.

 	Raises:
 		ValueError: If the response contains no complete
 		``"targetFBID":"..."`` field.
 	"""
 	plugin_comment_root = 'https://www.facebook.com/plugins/feedback.php'

 	# Request the plugin page with href = url_film.
 	resp = requests.get(plugin_comment_root, params={'href': url_film}).text

 	# str.partition reports a missing marker explicitly (empty separator).
 	# The previous find() + 14 arithmetic turned "not found" (-1) into
 	# index 13 and returned an arbitrary slice of the page as the ID.
 	_, marker, tail = resp.partition('"targetFBID":"')
 	if not marker:
 		raise ValueError('targetFBID not found in feedback plugin response for %r' % (url_film,))

 	# The ID runs up to the next double quote.
 	id_film_fb, closing_quote, _ = tail.partition('"')
 	if not closing_quote:
 		raise ValueError('unterminated targetFBID in feedback plugin response for %r' % (url_film,))

 	return id_film_fb

 def get_page(film_id, after_cursor = ''):
 	"""Fetch one page of Facebook plugin comments for a film.

 	POSTs to the ``reverse_time`` comment pager endpoint using the
 	module-level ``headers`` and ``limit``, then collects the entries of
 	``payload.idMap`` whose type is ``comment``.

 	Args:
 		film_id: Facebook target ID placed in the pager URL.
 		after_cursor: Pagination cursor sent as the ``after_cursor`` form
 		field; '' (the default) for the first request.

 	Returns:
 		A dict with three keys. 'data' is a list of dicts with keys
 		'comment_user', 'comment_content', 'comment_timestamp' and
 		'film_id'. 'next' is ``payload.afterCursor`` from the response.
 		'film_name' is the name of the first 'ogobject' entry, or ''.
 	"""
 	data = {
 		'after_cursor': after_cursor,
 		'limit': limit,
 		'__a': '1'
 	}
 	url = 'https://www.facebook.com/plugins/comments/async/'+ film_id +'/pager/reverse_time/'

 	http_response = requests.post(url, headers=headers, data=data)

 	# The first 9 characters are dropped before JSON parsing.
 	# NOTE(review): presumably a fixed guard prefix in front of the JSON -- confirm.
 	res_obj = json.loads(http_response.text[9:])

 	list_comment = res_obj['payload']['idMap']

 	comments = []
 	film_name = ''

 	for key in list_comment:
 		item = list_comment[key]
 		item_type = item['type']

 		# Entries of type 'user' contribute nothing to the result, so their
 		# fields are no longer read (a missing field used to raise KeyError).
 		if item_type == 'ogobject':
 			# Only the first ogobject name is kept.
 			if film_name == '':
 				film_name = item['name']
 		elif item_type == 'comment':
 			comments.append({
 				'comment_user': item['authorID'],
 				'comment_content': item['body']['text'],
 				'comment_timestamp': item['timestamp'],
 				'film_id': film_id
 			})

 	return {
 		'data': comments,
 		'next': res_obj['payload']['afterCursor'],
 		'film_name': film_name
 	}

 def get_all_of_film(url_film):
 	"""Collect the comments of every page for one film.

 	Resolves the Facebook target ID with ``get_id_film`` and then calls
 	``get_page`` repeatedly, following the cursor each page returns.

 	Args:
 		url_film: URL of the film page.

 	Returns:
 		A dict with 'data' (all comment dicts, in the order the pages were
 		fetched) and 'film_id' (the resolved Facebook target ID).
 	"""
 	results = []
 	after_cursor = ''
 	film_id = get_id_film(url_film)

 	while True:
 		res = get_page(film_id, after_cursor)

 		# Stop as soon as a page has no comments.
 		if not res['data']:
 			break

 		# Otherwise keep the page and continue from its cursor.
 		results.extend(res['data'])
 		next_cursor = res['next']

 		# An empty cursor repeats the first request and an unchanged cursor
 		# repeats the last one; either way an empty page would never be
 		# reached, so stop instead of looping forever.
 		if not next_cursor or next_cursor == after_cursor:
 			break
 		after_cursor = next_cursor

 	return {
 		'data': results,
 		'film_id': film_id
 	}

 def get_film_of_cate(cate, page):
 	"""List the films shown on one page of a phimmoi.net category.

 	Args:
 		cate: Category slug placed in the listing URL.
 		page: Page number; converted with ``str`` when building the URL.

 	Returns:
 		A list of dicts, one per ``<a class="block-wrapper">`` element found
 		in the page, with 'href' (the site root followed by the anchor's
 		href attribute) and 'title' (the anchor's title attribute with '/'
 		replaced by a space).
 	"""
 	site_root = 'http://phimmoi.net/'
 	listing_url = 'http://www.phimmoi.net/the-loai/'+cate+'/page-'+str(page)+'.html'

 	html = requests.get(listing_url).text
 	soup = BeautifulSoup(html, features="html.parser")

 	return [
 		{
 			'href': site_root + anchor['href'],
 			'title': anchor['title'].replace('/', ' ')
 		}
 		for anchor in soup.findAll('a', {'class': 'block-wrapper'})
 	]

 # Category slug and listing page to scrape.
 cate_film = 'phim-hanh-dong'
 page_film = 1

 list_film = get_film_of_cate(cate_film, page_film)

 # Dump the comments of every listed film into its own JSON file.
 for film in list_film:
 	url_film = film['href']
 	title_film = film['title']

 	print(title_film)
 	all_data = get_all_of_film(url_film)

 	# ensure_ascii=False writes non-ASCII characters as-is, so the file
 	# encoding is pinned to UTF-8 instead of the platform default (which
 	# may be unable to encode them). The data/ directory must already exist.
 	with open('data/' + title_film + '_' + all_data['film_id'] + '.json', 'w', encoding='utf-8') as outfile:
 		json.dump(all_data['data'], outfile, indent=4, ensure_ascii=False)
 end_time = time.time()

 # Elapsed wall-clock seconds for the whole run.
 total_time = end_time - start_time

 # print(total_time)
	import requests
	import json
	import time
	from bs4 import BeautifulSoup

	# Wall-clock start of the run; subtracted from end_time at the end of the script.
	start_time = time.time()

	# HTTP headers sent with the comment pager POST in get_page.
	# NOTE(review): the cookie value is hard-coded; presumably a Facebook session
	# cookie that can expire -- confirm, and consider loading it from configuration.
	headers = {
	'cookie': 'fr=0shZ5eUbOjaYtgLs0..BeuWpJ...1.0.BeuWpJ.'
	}
	# Value of the 'limit' form field sent by get_page
	# (presumably the number of comments per page -- confirm).
	limit = 20

	def get_id_film(url_film):
		"""Resolve the Facebook comment-plugin target ID for a film page URL.

		Requests the Facebook ``feedback.php`` plugin page with ``href`` set to
		``url_film`` and extracts the value of the ``"targetFBID"`` field from
		the returned text.

		Args:
			url_film: URL of the film page, sent as the ``href`` query parameter.

		Returns:
			The text between ``"targetFBID":"`` and the next double quote.

		Raises:
			ValueError: If the response contains no complete
			``"targetFBID":"..."`` field.
		"""
		plugin_comment_root = 'https://www.facebook.com/plugins/feedback.php'

		# Request the plugin page with href = url_film.
		resp = requests.get(plugin_comment_root, params={'href': url_film}).text

		# str.partition reports a missing marker explicitly (empty separator).
		# The previous find() + 14 arithmetic turned "not found" (-1) into
		# index 13 and returned an arbitrary slice of the page as the ID.
		_, marker, tail = resp.partition('"targetFBID":"')
		if not marker:
			raise ValueError('targetFBID not found in feedback plugin response for %r' % (url_film,))

		# The ID runs up to the next double quote.
		id_film_fb, closing_quote, _ = tail.partition('"')
		if not closing_quote:
			raise ValueError('unterminated targetFBID in feedback plugin response for %r' % (url_film,))

		return id_film_fb

	def get_page(film_id, after_cursor = ''):
		"""Fetch one page of Facebook plugin comments for a film.

		POSTs to the ``reverse_time`` comment pager endpoint using the
		module-level ``headers`` and ``limit``, then collects the entries of
		``payload.idMap`` whose type is ``comment``.

		Args:
			film_id: Facebook target ID placed in the pager URL.
			after_cursor: Pagination cursor sent as the ``after_cursor`` form
			field; '' (the default) for the first request.

		Returns:
			A dict with three keys. 'data' is a list of dicts with keys
			'comment_user', 'comment_content', 'comment_timestamp' and
			'film_id'. 'next' is ``payload.afterCursor`` from the response.
			'film_name' is the name of the first 'ogobject' entry, or ''.
		"""
		data = {
			'after_cursor': after_cursor,
			'limit': limit,
			'__a': '1'
		}
		url = 'https://www.facebook.com/plugins/comments/async/'+ film_id +'/pager/reverse_time/'

		http_response = requests.post(url, headers=headers, data=data)

		# The first 9 characters are dropped before JSON parsing.
		# NOTE(review): presumably a fixed guard prefix in front of the JSON -- confirm.
		res_obj = json.loads(http_response.text[9:])

		list_comment = res_obj['payload']['idMap']

		comments = []
		film_name = ''

		for key in list_comment:
			item = list_comment[key]
			item_type = item['type']

			# Entries of type 'user' contribute nothing to the result, so their
			# fields are no longer read (a missing field used to raise KeyError).
			if item_type == 'ogobject':
				# Only the first ogobject name is kept.
				if film_name == '':
					film_name = item['name']
			elif item_type == 'comment':
				comments.append({
					'comment_user': item['authorID'],
					'comment_content': item['body']['text'],
					'comment_timestamp': item['timestamp'],
					'film_id': film_id
				})

		return {
			'data': comments,
			'next': res_obj['payload']['afterCursor'],
			'film_name': film_name
		}

	def get_all_of_film(url_film):
		"""Collect the comments of every page for one film.

		Resolves the Facebook target ID with ``get_id_film`` and then calls
		``get_page`` repeatedly, following the cursor each page returns.

		Args:
			url_film: URL of the film page.

		Returns:
			A dict with 'data' (all comment dicts, in the order the pages were
			fetched) and 'film_id' (the resolved Facebook target ID).
		"""
		results = []
		after_cursor = ''
		film_id = get_id_film(url_film)

		while True:
			res = get_page(film_id, after_cursor)

			# Stop as soon as a page has no comments.
			if not res['data']:
				break

			# Otherwise keep the page and continue from its cursor.
			results.extend(res['data'])
			next_cursor = res['next']

			# An empty cursor repeats the first request and an unchanged cursor
			# repeats the last one; either way an empty page would never be
			# reached, so stop instead of looping forever.
			if not next_cursor or next_cursor == after_cursor:
				break
			after_cursor = next_cursor

		return {
			'data': results,
			'film_id': film_id
		}

	def get_film_of_cate(cate, page):
		"""List the films shown on one page of a phimmoi.net category.

		Args:
			cate: Category slug placed in the listing URL.
			page: Page number; converted with ``str`` when building the URL.

		Returns:
			A list of dicts, one per ``<a class="block-wrapper">`` element found
			in the page, with 'href' (the site root followed by the anchor's
			href attribute) and 'title' (the anchor's title attribute with '/'
			replaced by a space).
		"""
		domain = 'http://phimmoi.net/'
		url_cate_film = 'http://www.phimmoi.net/the-loai/'+cate+'/page-'+str(page)+'.html'

		resp = requests.get(url_cate_film).text
		soup = BeautifulSoup(resp, features="html.parser")

		# One result per film anchor; '/' is replaced in the title because the
		# caller uses the title as part of a file name.
		return [
			{
				'href': domain + item['href'],
				'title': item['title'].replace('/', ' ')
			}
			for item in soup.findAll('a', {'class': 'block-wrapper'})
		]

	# Category slug and listing page to scrape.
	cate_film = 'phim-hanh-dong'
	page_film = 1

	list_film = get_film_of_cate(cate_film, page_film)

	# Dump the comments of every listed film into its own JSON file.
	for film in list_film:
		url_film = film['href']
		title_film = film['title']

		print(title_film)
		all_data = get_all_of_film(url_film)

		# ensure_ascii=False writes non-ASCII characters as-is, so the file
		# encoding is pinned to UTF-8 instead of the platform default (which
		# may be unable to encode them). The data/ directory must already exist.
		with open('data/' + title_film + '_' + all_data['film_id'] + '.json', 'w', encoding='utf-8') as outfile:
			json.dump(all_data['data'], outfile, indent=4, ensure_ascii=False)
	end_time = time.time()

	# Elapsed wall-clock seconds for the whole run.
	total_time = end_time - start_time

	# print(total_time)
No results found