fipski · August 1, 2020 13:13
diff --git a/escape_html_body_to_json.py b/escape_html_body_to_json.py
 #!/usr/bin/python3
 import sys
 import json
 import html
 from escapejson import escapejson
 from lxml import etree
 import lxml.html
 from io import StringIO, BytesIO

 # Purpose: read the HTML file named by the first command-line argument,
 # extract the inner markup of its <body>, and print it as an escaped JSON
 # string literal on stdout.

 # Fail with a readable message instead of an IndexError traceback when the
 # file argument is missing (exit status stays non-zero).
 if len(sys.argv) < 2:
     sys.exit('usage: {} HTML_FILE'.format(sys.argv[0]))

 with open(sys.argv[1], 'r') as f:
     ret = f.read()

 # Parse with lxml's HTML parser and locate the <body> element.
 parser = etree.HTMLParser()
 tree = etree.parse(StringIO(ret), parser)
 root = tree.getroot()
 # find() returns None when there is no <body> child; serializing None would
 # otherwise raise an opaque TypeError. NOTE(review): root itself is guarded
 # too in case the parser yields no root for degenerate input -- confirm.
 body = root.find('body') if root is not None else None
 if body is None:
     sys.exit('error: no <body> element found in {}'.format(sys.argv[1]))

 # Serialize <body> and keep only the text between the end of its opening tag
 # and the first '</body>', i.e. the inner markup.
 ret = etree.tostring(body).decode('utf-8')
 start = ret.find('>') + 1
 end = ret.find('</body>')
 if end == -1:
     # No closing tag means <body> was serialized self-closed (empty body);
     # slicing with -1 would silently mean "drop the last character".
     ret = ''
 else:
     ret = ret[start:end]

 # Turn character/entity references in the serialized markup back into
 # literal characters.
 ret = html.unescape(ret)
 # Encode the markup as a JSON string literal.
 ret = json.dumps(ret)
 # NOTE(review): presumably makes the JSON safe to embed in HTML/JS -- confirm
 # against the escapejson package documentation.
 ret = escapejson(ret)

 print(ret)
	#!/usr/bin/python3
	import sys
	import json
	import html
	from escapejson import escapejson
	from lxml import etree
	import lxml.html
	from io import StringIO, BytesIO

	# Purpose: read the HTML file named by the first command-line argument,
	# extract the inner markup of its <body>, and print it as an escaped JSON
	# string literal on stdout.

	# Fail with a readable message instead of an IndexError traceback when the
	# file argument is missing (exit status stays non-zero).
	if len(sys.argv) < 2:
	    sys.exit('usage: {} HTML_FILE'.format(sys.argv[0]))

	# The read must be indented under the `with`; a `with` statement without
	# an indented body is a syntax error.
	with open(sys.argv[1], 'r') as f:
	    ret = f.read()

	# Parse with lxml's HTML parser and locate the <body> element.
	parser = etree.HTMLParser()
	tree = etree.parse(StringIO(ret), parser)
	root = tree.getroot()
	# find() returns None when there is no <body> child; serializing None would
	# otherwise raise an opaque TypeError. NOTE(review): root itself is guarded
	# too in case the parser yields no root for degenerate input -- confirm.
	body = root.find('body') if root is not None else None
	if body is None:
	    sys.exit('error: no <body> element found in {}'.format(sys.argv[1]))

	# Serialize <body> and keep only the text between the end of its opening tag
	# and the first '</body>', i.e. the inner markup.
	ret = etree.tostring(body).decode('utf-8')
	start = ret.find('>') + 1
	end = ret.find('</body>')
	if end == -1:
	    # No closing tag means <body> was serialized self-closed (empty body);
	    # slicing with -1 would silently mean "drop the last character".
	    ret = ''
	else:
	    ret = ret[start:end]

	# Turn character/entity references in the serialized markup back into
	# literal characters.
	ret = html.unescape(ret)
	# Encode the markup as a JSON string literal.
	ret = json.dumps(ret)
	# NOTE(review): presumably makes the JSON safe to embed in HTML/JS -- confirm
	# against the escapejson package documentation.
	ret = escapejson(ret)

	print(ret)