Created
June 19, 2022 07:35
-
-
Save south1907/85b054334a96f122ad9d311678bfd22f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json

import openpyxl
import pandas as pd
import requests
import xlsxwriter
from bs4 import BeautifulSoup
def get_row_data(link_detail):
    """Scrape one Product Hunt product page for its stats, website and description.

    Args:
        link_detail: Absolute URL of the product page to fetch.

    Returns:
        A list of exactly five strings, in this order:
        the three values of the stats row (the export script labels them
        ``upvotes``, ``launches`` and ``followers``), the ``href`` of the
        website button, and the text of the last description element.
        Any piece that cannot be found on the page is returned as ``''``.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            (including when the 30 second timeout expires).
    """
    # NOTE(review): these headers (including the session cookie) were captured
    # from a browser session and are replayed verbatim. The cookie is a
    # credential committed to source and will eventually expire.
    headers = {
        'authority': 'www.producthunt.com',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'max-age=0',
        'cookie': 'first_visit=1654929088; first_referer=; ajs_anonymous_id=%22628897fa-3abb-4142-bc97-5cd8498b34d0%22; _delighted_web={%2271AaKmxD4TpPsjYW%22:{%22_delighted_fst%22:{%22t%22:%221654929088226%22}}}; _ga=GA1.2.1194507260.1654929088; _gid=GA1.2.1865172956.1654929088; visitor_id=79f8abb7-0d16-4544-b8cd-cd17ba70011d; track_code=8bc55d00b2; g_state={"i_p":1654936304525,"i_l":1}; csrf_token=ZQwohUveJ5pQwUtACpEyeMIb8yrRNb0omt7ribwddN58IaRx%2Fuj1PwpOpcAFsLTiiY7JfpcXStLqGRfumURwBA%3D%3D; _producthunt_session_production=kj5xAcTy%2FmoPvSGXsd9UmPtrC%2FybCFW61K1tqUGjjKafJw8G9YheayzD3V8aK2iOPErZtQ9cpsWHo755EKcqTzcXObpTmRXRsiS0zNYTFLMzq4dEJUukzusX408M7E4AN8XceFf85%2FiHO7JlucuQ7cRCIwIAlWqjUOSCYId4qbVpAwFWsix%2FkQJcIzj1cr2z9LF6XGzpE68nfyv3%2BG9ockMl%2B8Mib9u6u7qIAVcIt9AAOuFlSDnnbTxXfD9AQvIhMmOLhE%2Fv9UDTT9bEq%2Bb4maqe3iOt71hj1FMAYzShlqq9h57vq46lfp9MTQGVg%2FRoPHaziz1QVEwWz%2BsaZnwhlMiF7qCFdJSYadt7ptFpGHPVJfYsmmgipw0v%2F%2ByPhOCDVhFsUS2yKzFRVu9D7rjqaL4%2FSNNBczF%2FDpTVBJIHrqcOcyt%2FWN4RuFThgIxCYAwgLay%2FtTxpYWfvOGbSKFFnV2WEPWoLJYZOE7OmGA0%3D--WNYtOdTtEikObi19--tmXOs9r71CxPF6Y%2BddojTQ%3D%3D; _producthunt_session_production=UVTrsQELl53GdfwBanr%2B9qVeZ6xSiS44TtBaYGR6ve8rxhlvOB9qUO58ZcrYGdopr7Mb5gLgxDLcxIbJkRBfrR1vVYGuPLJXWMMdshYRlpGWXYypyy8yiytU3oY7Fz6LsdgTiqOCOUqjkF6nSRpXZtswNPWbHo3GC3lIDG0eIqc5tMnXWQOQ5sPMr55%2BwejtvmL6EZXin%2B5zar62ZRmZLFvYQqt8k83dHCf2C15GCPD6ONwlJOuSw%2B5ZrsV3vZP3MkYRGdaNQIC4gp6W5ZBSj0SSRbXX7zPtppdYVsUHb%2FwgNj67SfSeRolCPl94Wa%2BOWHFQz%2BquIIChV17cHXxK1%2BttSOYL8JcVFdFhW59GKlorU7RVrA%3D%3D--dbyTmU4YCrNTGqff--lq0LMN%2BR8ZFFEG4bYazTrQ%3D%3D; csrf_token=61q0Wbq0bU0RIWWKlZ0EUcAsI3p6f%2BwQBVEWpDB4FSTydzitD4K%2F6EuuiwqavILLi7kZLjxdG%2Bp1lurDFSER%2Fg%3D%3D',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61 Safari/537.36',
    }

    # A timeout keeps one stalled page from hanging the whole crawl.
    response = requests.get(link_detail, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Stats row: only trusted when it holds exactly three entries; otherwise
    # (or when the container is missing entirely) fall back to blanks so the
    # result always has the same five-column shape.
    stats = ['', '', '']
    stats_container = soup.find(
        'div',
        {'class': 'style_flex___KlcI style_direction-row__oinjH style_flex-row-gap-4__izJPT style_mt-4__uHhPT'},
    )
    if stats_container is not None:
        infos = stats_container.find_all(
            'div',
            {'class': 'style_flex___KlcI style_direction-row__oinjH style_flex-row-gap-1__VY472'},
        )
        if len(infos) == 3:
            stats = []
            for info in infos:
                value_cell = info.find('div')
                stats.append(value_cell.getText() if value_cell is not None else '')
    result = list(stats)

    # Link to the product's own website.
    find_website = soup.find(
        'a',
        {'class': 'styles_reset__opz7w styles_button__zKntg styles_secondary__aa4sx'},
    )
    result.append(find_website.get('href', '') if find_website is not None else '')

    # Several elements share the description styling; the last one is used.
    find_description = soup.find_all(
        'div',
        {'class': 'style_color-dark-grey__aN5DV style_fontSize-16__DCrgA style_fontWeight-400__5p97M'},
    )
    result.append(find_description[-1].getText() if find_description else '')

    return result
def get_page_by_cursor(cursor, slug="productivity", order="most-upvoted"):
    """Fetch one page of posts for a topic from Product Hunt's frontend GraphQL API.

    Args:
        cursor: Pagination cursor sent as the ``cursor`` variable (the
            ``endCursor`` of the previous page; the driver script passes
            ``''`` for the first page).
        slug: Topic slug to query. Defaults to ``"productivity"``.
        order: Post ordering sent as the ``order`` variable. Defaults to
            ``"most-upvoted"``.

    Returns:
        The decoded JSON response body. The post list requested by the query
        is under ``['data']['topic']['posts']`` with ``edges`` and
        ``pageInfo`` (``endCursor``, ``hasNextPage``).

    Raises:
        requests.exceptions.HTTPError: if the server answers with a 4xx/5xx
            status.
        requests.exceptions.RequestException: on connection failure or when
            the 30 second timeout expires.
        ValueError: if the response body is not valid JSON.
    """
    url = "https://www.producthunt.com/frontend/graphql"

    # The query is one contiguous string; adjacent literals are concatenated
    # with no separator, one literal per GraphQL fragment for readability.
    query = (
        "query TopicPage($slug:String!$cursor:String$query:String$subtopic:ID$order:String$topPostsVariant:TopPostsCardVariant!){topic(slug:$slug){id slug parent{id name slug __typename}...MetaTags ...TopicPageHeaderFragment ...TopicPagePostListFragment relatedAd(kind:\"feed\"){...AdFragment __typename}relatedTopics(limit:3){id ...RelatedTopicsSidebarCardFragment __typename}__typename}stories(first:3 order:TRENDING){edges{node{id ...StoriesSidebarCardFragment __typename}__typename}__typename}...TopPostsSidebarCardFragment}"
        "fragment TopPostsSidebarCardFragment on Query{postsTop(preferredVariant:$topPostsVariant){variant posts{id name slug tagline ...PostThumbnail __typename}__typename}__typename}"
        "fragment PostThumbnail on Post{id name thumbnailImageUuid ...PostStatusIcons __typename}"
        "fragment PostStatusIcons on Post{id name productState __typename}"
        "fragment StoriesSidebarCardFragment on AnthologiesStory{id slug title headerImageUuid minsToRead __typename}"
        "fragment RelatedTopicsSidebarCardFragment on Topic{id slug name imageUuid description __typename}"
        "fragment MetaTags on SEOInterface{id meta{canonicalUrl creator description image mobileAppUrl oembedUrl robots title type author authorUrl __typename}__typename}"
        "fragment AdFragment on AdChannel{id post{id slug name updatedAt commentsCount ...PostVoteButtonFragment __typename}ctaText dealText name tagline thumbnailUuid url __typename}"
        "fragment PostVoteButtonFragment on Post{id featuredAt updatedAt createdAt disabledWhenScheduled hasVoted ...on Votable{id votesCount __typename}__typename}"
        "fragment TopicPageHeaderFragment on Topic{id name description parent{id name slug __typename}...TopicFollowButton ...FacebookShareButtonFragment topPosts:posts(first:3 order:\"most-upvoted\"){edges{node{id name slug ...PostThumbnail __typename}__typename}__typename}__typename}"
        "fragment TopicFollowButton on Topic{id slug name isFollowed followersCount ...TopicImage __typename}"
        "fragment TopicImage on Topic{name imageUuid __typename}"
        "fragment FacebookShareButtonFragment on Shareable{id url __typename}"
        "fragment TopicPagePostListFragment on Topic{name slug posts(first:20 after:$cursor query:$query subtopic:$subtopic order:$order){edges{node{id ...PostItem ...TopicPageReviewRatingFragment __typename}__typename}pageInfo{endCursor hasNextPage __typename}__typename}__typename}"
        "fragment PostItem on Post{id commentsCount name shortenedUrl slug tagline updatedAt pricingType topics(first:1){edges{node{id name slug __typename}__typename}__typename}redirectToProduct{id slug __typename}...PostThumbnail ...PostVoteButton __typename}"
        "fragment PostVoteButton on Post{id featuredAt updatedAt createdAt disabledWhenScheduled hasVoted ...on Votable{id votesCount __typename}__typename}"
        "fragment TopicPageReviewRatingFragment on Post{id reviewsWithBodyCount product{id slug __typename}__typename}"
    )

    payload = json.dumps({
        "operationName": "TopicPage",
        "variables": {
            "slug": slug,
            "order": order,
            "cursor": cursor,
            "query": None,
            "topPostsVariant": "THIS_WEEK",
            "includeLayout": False,
        },
        "query": query,
    })

    # NOTE(review): these headers (including the session cookie and the
    # newrelic/trace ids) were captured from a browser session and are
    # replayed verbatim. The cookie is a credential committed to source and
    # will eventually expire.
    headers = {
        'authority': 'www.producthunt.com',
        'accept': '*/*',
        'accept-language': 'en-US,en;q=0.9',
        'content-type': 'application/json',
        'cookie': 'first_visit=1654929088; first_referer=; ajs_anonymous_id=%22628897fa-3abb-4142-bc97-5cd8498b34d0%22; _delighted_web={%2271AaKmxD4TpPsjYW%22:{%22_delighted_fst%22:{%22t%22:%221654929088226%22}}}; _ga=GA1.2.1194507260.1654929088; _gid=GA1.2.1865172956.1654929088; visitor_id=79f8abb7-0d16-4544-b8cd-cd17ba70011d; track_code=8bc55d00b2; g_state={"i_p":1654936304525,"i_l":1}; _gat=1; csrf_token=fD5%2BecIhhGYELtY2m4PPz0X6Jk7kLlBIYgrbwIwDoKVlE%2FKNdxdWw16hOLaUoklVDm8cGqIMp7ISzSenqVqkfw%3D%3D; _producthunt_session_production=CvQI14XphpKK46l3AB7ixRPz4CpCiM4NLPUW%2B2xKSnxK6jFYiB%2FTxlt%2BUdGa6wcsfouFAwJC2GJDomq1ljkmI0vA1FEVC9YzLvHhcvCP41fEcRwmjipl%2F22h6DdUMTWpWppyS7L8qU4mNJ8GNDbX5waAHB7JzHItkdjZeyNV4qAdwZgJKQa9WU0JXyJ2%2FhKBjy16ylp9xjwaatsTM7aCbae%2BF9ziWFUO3nWywoOOUVdY5HYfktorH0OqHTCsYIw1%2BK8KP50ZNh8TleykOO59J6QyRzISPwVO1KB22SfQY%2BGE6CZeqU106wWRBH9feAOWWy5fedaS%2BY%2FF0r9m458IUmai9DmNYIwnFun5HU0%3D--%2FAzoSXAwPi9f7aho--JivoTF0rZbF2cL84JxTs5w%3D%3D; _producthunt_session_production=UVTrsQELl53GdfwBanr%2B9qVeZ6xSiS44TtBaYGR6ve8rxhlvOB9qUO58ZcrYGdopr7Mb5gLgxDLcxIbJkRBfrR1vVYGuPLJXWMMdshYRlpGWXYypyy8yiytU3oY7Fz6LsdgTiqOCOUqjkF6nSRpXZtswNPWbHo3GC3lIDG0eIqc5tMnXWQOQ5sPMr55%2BwejtvmL6EZXin%2B5zar62ZRmZLFvYQqt8k83dHCf2C15GCPD6ONwlJOuSw%2B5ZrsV3vZP3MkYRGdaNQIC4gp6W5ZBSj0SSRbXX7zPtppdYVsUHb%2FwgNj67SfSeRolCPl94Wa%2BOWHFQz%2BquIIChV17cHXxK1%2BttSOYL8JcVFdFhW59GKlorU7RVrA%3D%3D--dbyTmU4YCrNTGqff--lq0LMN%2BR8ZFFEG4bYazTrQ%3D%3D; csrf_token=61q0Wbq0bU0RIWWKlZ0EUcAsI3p6f%2BwQBVEWpDB4FSTydzitD4K%2F6EuuiwqavILLi7kZLjxdG%2Bp1lurDFSER%2Fg%3D%3D',
        'newrelic': 'eyJ2IjpbMCwxXSwiZCI6eyJ0eSI6IkJyb3dzZXIiLCJhYyI6IjE4NjQxMTMiLCJhcCI6IjU5NDMzNzgyMiIsImlkIjoiM2RlNGZiYTgxMTQwYjBlMyIsInRyIjoiMTdlYTQ3YmNhYzU2MTA0OWIxZWUwOTdmZGRiZDA4ZDciLCJ0aSI6MTY1NDkzODIyMTQyMn19',
        'origin': 'https://www.producthunt.com',
        # Referer follows the requested topic so it stays consistent with slug.
        'referer': 'https://www.producthunt.com/topics/' + slug,
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'traceparent': '00-17ea47bcac561049b1ee097fddbd08d7-3de4fba81140b0e3-01',
        'tracestate': '1864113@nr=0-1-1864113-594337822-3de4fba81140b0e3----1654938221422',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest',
    }

    response = requests.post(url, headers=headers, data=payload, timeout=30)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    return response.json()
# Driver: walk the topic's post list page by page, enrich every post that has
# a product page with details scraped from that page, and export the rows to
# result.xlsx.

# Column order matches the row layout built below: four fields from the
# GraphQL node, the product link, then the five values from get_row_data().
RESULT_COLUMNS = ['name', 'slug', 'tagline', 'votesCount', 'link_origin',
                  'upvotes', 'launches', 'followers', 'website', 'bio']


def _save_results(rows):
    """Write the collected rows to result.xlsx and return the DataFrame."""
    frame = pd.DataFrame(rows, columns=RESULT_COLUMNS)
    frame.to_excel('result.xlsx', engine='xlsxwriter')
    return frame


data_file = []
current_cursor = ''
for i in range(0, 10000):  # hard cap on the number of pages requested
    print('page: ' + str(i))
    data = get_page_by_cursor(current_cursor)
    topics = data['data']['topic']['posts']
    edges = topics['edges']
    page_info = topics['pageInfo']
    print(page_info)

    # Process this page's posts BEFORE looking at hasNextPage, so the posts
    # on the final page are exported too.
    for edge in edges:
        node = edge['node']
        # Posts without an attached product have no detail page to scrape.
        if node.get('product') is None:
            continue
        try:
            row = [node['name'], node['product']['slug'], node['tagline'], node['votesCount']]
            link_detail = 'https://www.producthunt.com/products/' + node['product']['slug']
            print(link_detail)
            row.append(link_detail)
            row.extend(get_row_data(link_detail))
            data_file.append(row)
        except Exception as e:
            # Best effort: one bad post must not abort the crawl, but report
            # why it was skipped along with the offending node.
            print('skipped post: ' + repr(e))
            print(node)

    if not page_info['hasNextPage']:
        print('break by Done page')
        break
    current_cursor = page_info['endCursor']

    # Checkpoint after every page so a later failure does not lose the rows
    # collected so far.
    _save_results(data_file)

df = _save_results(data_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment