Last active
December 28, 2021 10:21
-
-
Save sameerkumar18/6f54b2064237b1614e21965a7b236166 to your computer and use it in GitHub Desktop.
Export Wix Blogs to CSV - No API needed.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Accumulates one dict per scraped blog post; dumped to data.csv at the end.
responses = []
# Base URL of the Wix site to export (no trailing slash) — replace the placeholder.
WIX_SITE_URL = 'https://www.YOUR WIX SITE.com'
import xmltodict
import requests
import json
import csv
def get_blog_posts_urls():
    """Return the list of blog-post URLs from the Wix blog sitemap.

    Fetches ``<WIX_SITE_URL>/blog-posts-sitemap.xml`` and extracts every
    ``<loc>`` entry.

    Returns:
        list[str]: one absolute URL per published blog post.

    Raises:
        requests.HTTPError: on a non-2xx response (instead of silently
            feeding an error page to the XML parser).
        requests.Timeout: if the site does not answer within 30 seconds.
    """
    sitemap_url = f'{WIX_SITE_URL}/blog-posts-sitemap.xml'
    res = requests.get(sitemap_url, timeout=30)  # avoid hanging forever
    res.raise_for_status()
    raw = xmltodict.parse(res.text)
    entries = raw['urlset']['url']
    # xmltodict collapses a single <url> element to a plain dict; normalize
    # so a one-post blog does not crash the comprehension below.
    if isinstance(entries, dict):
        entries = [entries]
    blog_urls = [entry['loc'] for entry in entries]
    print(blog_urls)
    return blog_urls
blog_urls = get_blog_posts_urls() | |
def _remove_attrs(soup):
    """Strip presentation attributes from every tag in *soup*, in place.

    Keeps only attributes whose NAME contains 'data-hook', 'data-id',
    'src' or 'href' (substring match, so e.g. 'srcset' is also kept);
    every other attribute is deleted.  A tag that carries at least one
    kept attribute AND whose original attrs show type="empty-line" or a
    data-hook containing 'rcv-' (Wix rich-content-viewer chrome) is
    detached from the tree entirely.  Returns the same soup object.
    """
    for tag in soup.find_all(True):
        # Snapshot the attrs so we can delete from tag.attrs while iterating.
        attrs = dict(tag.attrs)
        for attr in attrs:
            if 'data-hook' not in attr and 'data-id' not in attr and 'src' not in attr and 'href' not in attr:
                del tag.attrs[attr]
            # Only reached for kept attr names; note the condition reads the
            # full snapshot (not the current attr), so extract() may fire
            # more than once per tag — harmless, extract is idempotent.
            elif (attrs.get('type') and attrs['type'] == 'empty-line') or (attrs.get('data-hook') and 'rcv-' in attrs['data-hook']):
                tag.extract()
    return soup
from bs4 import BeautifulSoup | |
import time | |
# Visit every post, pull its metadata and cleaned HTML body, and collect
# one row per post into `responses`.
for post_url in blog_urls:
    time.sleep(1)  # throttle: one request per second, be polite to the server
    response = requests.get(post_url)
    soup = BeautifulSoup(response.content, "html.parser")

    # Open Graph <meta> tags carry the thumbnail and the short description.
    thumbnail_url = soup.find_all(attrs={"property": "og:image"})[0]['content']
    description = soup.find_all(attrs={"property": "og:description"})[0]['content']

    # Drop presentation attributes before extracting the article body.
    soup = _remove_attrs(soup)
    # print(soup)

    def first_by_hook(hook):
        # First element tagged with the given Wix data-hook.
        return soup.find_all(attrs={"data-hook": hook})[0]

    title = first_by_hook("post-title").text
    author = first_by_hook("user-name").text
    date = first_by_hook("time-ago").text
    content_html = str(soup.find_all(attrs={"data-id": "rich-content-viewer"})[0])
    categories = ','.join(
        label.text
        for label in soup.find_all(attrs={"data-hook": "category-label-list__item"})
    )

    print(author)
    print(date)
    responses.append({
        'post_title': title,
        'post_author': author,
        'post_date': date,
        'post_content_html': content_html,
        'post_thumbnail': thumbnail_url,
        'post_description': description,
        'post_url': post_url,
        'post_categories': categories,
    })
# Sanity check: show the first scraped post before writing the file.
print(responses[0])
# Use the first row's keys as the CSV header (all rows share the same schema).
keys = responses[0].keys()
# newline='' is required by the csv module; encoding='utf-8' keeps non-ASCII
# post text from raising UnicodeEncodeError on platforms whose default
# locale encoding is not UTF-8 (e.g. Windows cp1252).
with open('data.csv', 'w', newline='', encoding='utf-8') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(responses)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment