Skip to content

Instantly share code, notes, and snippets.

@fipski
Created August 1, 2020 13:13
Show Gist options
  • Save fipski/a2c4b6bc7a9cb9687cf2ef85e8d959d8 to your computer and use it in GitHub Desktop.
Save fipski/a2c4b6bc7a9cb9687cf2ef85e8d959d8 to your computer and use it in GitHub Desktop.
Read an HTML file and print the content of its body, including the HTML markup, as a JSON-escaped string.
#!/usr/bin/python3
import sys
import json
import html
from escapejson import escapejson
from lxml import etree
import lxml.html
from io import StringIO, BytesIO
# Read the HTML file named by the first command-line argument and print the
# inner markup of its <body> element as a single JSON string literal.
if len(sys.argv) < 2:
    # Fail with a readable message instead of an IndexError traceback.
    sys.exit('usage: {} FILE.html'.format(sys.argv[0]))

with open(sys.argv[1], 'r') as f:
    source = f.read()

# Parse with lxml's HTML parser rather than the XML one, so the input does
# not have to be well-formed XML.
parser = etree.HTMLParser()
tree = etree.parse(StringIO(source), parser)
root = tree.getroot()
body = root.find('body') if root is not None else None

if body is None:
    # No <body> element was found: there is nothing to extract.
    inner = ''
else:
    serialized = etree.tostring(body).decode('utf-8')
    # Drop the wrapping tags: the first '>' is taken as the end of the
    # opening <body ...> tag, and the *last* '</body>' as the closing tag, so
    # a literal '</body>' inside the content cannot truncate the output.
    start = serialized.find('>') + 1
    end = serialized.rfind('</body>')
    # Without a closing tag (e.g. a self-closing empty body) there is no
    # content between the tags.
    inner = serialized[start:end] if end != -1 else ''

# Turn character/entity references produced by serialization back into the
# characters they stand for before JSON-encoding.
inner = html.unescape(inner)
ret = json.dumps(inner)
# NOTE(review): escapejson presumably makes the JSON safe to embed inside
# HTML; confirm against the escapejson package documentation.
ret = escapejson(ret)
print(ret)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment