Created
May 18, 2021 17:03
-
-
Save clayote/ff1aed82053863b9c204dccc4aad219d to your computer and use it in GitHub Desktop.
My effort to get useful data out of Twitter profiles
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import random
import time

from lxml.etree import ParserError
from requests_html import HTMLSession, HTML
# Request headers copied from what Chromium sends.  They are identical for
# every profile request, so they are built once instead of once per loop
# iteration.
# NOTE: passing an active session cookie was also tried and did not seem to
# make a difference.
HEADERS = {
    "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="90"',
    "sec-ch-ua-mobile": "?0",
    "Upgrade-Insecure-Requests": "1",
    # The value must not repeat the header name ("User-Agent: ...").
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
    ),
}

# CSS selector for the profile-header items block on a rendered profile page.
PROFILE_ITEMS_SELECTOR = '[data-testid="UserProfileHeader_Items"]'


def profile_url(userid):
    """Return the profile URL that is requested for the account id *userid*."""
    return f"https://twitter.com/i/user/{userid}"


def fetch_profile_items(session, userid):
    """Fetch one profile page and return the text of its header items.

    Args:
        session: An ``HTMLSession`` used to issue the request.
        userid: The account id to look up.

    Returns:
        A list with the text of every element matching
        ``PROFILE_ITEMS_SELECTOR``; empty when nothing matches or the page
        could not be parsed.
    """
    page = session.get(profile_url(userid), headers=HEADERS)
    try:
        page.html.render()
        elements = page.html.find(PROFILE_ITEMS_SELECTOR)
    except ParserError:
        # NOTE(review): presumably raised by lxml when the response body is
        # empty -- confirm.  Treated as "no data" so that one bad page does
        # not abort the whole crawl.
        return []
    # NOTE: in the author's experience the find call never returns anything,
    # so this list has so far always been empty; further processing of the
    # items would go here once that is solved.
    # Element objects are not JSON-serializable, so keep only their text.
    return [element.text for element in elements]


def main():
    """Read ``following.json``, scrape each profile, write the results.

    Writes ``following_detailed.json`` containing one
    ``{"accountId": ..., "items": [...]}`` object per followed account, in
    input order.
    """
    with open("following.json", "rt", encoding="utf-8") as inf:
        following = json.load(inf)

    useful = []
    # The context manager closes the session even if a request raises.
    with HTMLSession() as session:
        for i, you in enumerate(following):
            userid = you["following"]["accountId"]
            useful.append(
                {
                    "accountId": userid,
                    "items": fetch_profile_items(session, userid),
                }
            )
            # Random pause between requests to space them out.
            time.sleep(random.randrange(1, 10))
            if i % 100 == 0:
                print(i)

    with open("following_detailed.json", "w", encoding="utf-8") as outf:
        json.dump(useful, outf)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment