Last active
December 28, 2021 10:21
-
-
Save sameerkumar18/6f54b2064237b1614e21965a7b236166 to your computer and use it in GitHub Desktop.
Export Wix Blogs to CSV - No API needed.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Accumulates one dict per scraped blog post; dumped to data.csv at the end.
responses = []
# Base URL of the Wix site to export (no trailing slash) — replace the placeholder.
WIX_SITE_URL = 'https://www.YOUR WIX SITE.com'
import xmltodict
import requests
import json
import csv
def get_blog_posts_urls():
    """Return the list of blog-post URLs from the Wix blog sitemap.

    Fetches ``<WIX_SITE_URL>/blog-posts-sitemap.xml`` and extracts every
    ``<loc>`` entry.

    Returns:
        list[str]: one absolute URL per published blog post.

    Raises:
        requests.HTTPError: on a non-2xx response (instead of silently
            feeding an error page to the XML parser).
        requests.Timeout: if the site does not answer within 30 seconds.
    """
    sitemap_url = f'{WIX_SITE_URL}/blog-posts-sitemap.xml'
    res = requests.get(sitemap_url, timeout=30)  # avoid hanging forever
    res.raise_for_status()
    raw = xmltodict.parse(res.text)
    entries = raw['urlset']['url']
    # xmltodict collapses a single <url> element to a plain dict; normalize
    # so a one-post blog does not crash the comprehension below.
    if isinstance(entries, dict):
        entries = [entries]
    blog_urls = [entry['loc'] for entry in entries]
    print(blog_urls)
    return blog_urls
blog_urls = get_blog_posts_urls() | |
def _remove_attrs(soup):
    """Strip presentation attributes from every tag in *soup*, in place.

    Keeps only attributes whose NAME contains 'data-hook', 'data-id',
    'src' or 'href' (substring match, so e.g. 'srcset' is also kept);
    every other attribute is deleted.  A tag that carries at least one
    kept attribute AND whose original attrs show type="empty-line" or a
    data-hook containing 'rcv-' (Wix rich-content-viewer chrome) is
    detached from the tree entirely.  Returns the same soup object.
    """
    for tag in soup.find_all(True):
        # Snapshot the attrs so we can delete from tag.attrs while iterating.
        attrs = dict(tag.attrs)
        for attr in attrs:
            if 'data-hook' not in attr and 'data-id' not in attr and 'src' not in attr and 'href' not in attr:
                del tag.attrs[attr]
            # Only reached for kept attr names; note the condition reads the
            # full snapshot (not the current attr), so extract() may fire
            # more than once per tag — harmless, extract is idempotent.
            elif (attrs.get('type') and attrs['type'] == 'empty-line') or (attrs.get('data-hook') and 'rcv-' in attrs['data-hook']):
                tag.extract()
    return soup
from bs4 import BeautifulSoup | |
import time | |
# Visit every post, pull its metadata and cleaned HTML body, and collect
# one row per post into `responses`.
for post_url in blog_urls:
    time.sleep(1)  # throttle: one request per second, be polite to the server
    response = requests.get(post_url)
    soup = BeautifulSoup(response.content, "html.parser")

    # Open Graph <meta> tags carry the thumbnail and the short description.
    thumbnail_url = soup.find_all(attrs={"property": "og:image"})[0]['content']
    description = soup.find_all(attrs={"property": "og:description"})[0]['content']

    # Drop presentation attributes before extracting the article body.
    soup = _remove_attrs(soup)
    # print(soup)

    def first_by_hook(hook):
        # First element tagged with the given Wix data-hook.
        return soup.find_all(attrs={"data-hook": hook})[0]

    title = first_by_hook("post-title").text
    author = first_by_hook("user-name").text
    date = first_by_hook("time-ago").text
    content_html = str(soup.find_all(attrs={"data-id": "rich-content-viewer"})[0])
    categories = ','.join(
        label.text
        for label in soup.find_all(attrs={"data-hook": "category-label-list__item"})
    )

    print(author)
    print(date)
    responses.append({
        'post_title': title,
        'post_author': author,
        'post_date': date,
        'post_content_html': content_html,
        'post_thumbnail': thumbnail_url,
        'post_description': description,
        'post_url': post_url,
        'post_categories': categories,
    })
# Sanity check: show the first scraped post before writing the file.
print(responses[0])
# Use the first row's keys as the CSV header (all rows share the same schema).
keys = responses[0].keys()
# newline='' is required by the csv module; encoding='utf-8' keeps non-ASCII
# post text from raising UnicodeEncodeError on platforms whose default
# locale encoding is not UTF-8 (e.g. Windows cp1252).
with open('data.csv', 'w', newline='', encoding='utf-8') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(responses)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment