Last active
January 8, 2016 13:42
-
-
Save 1844144/96a34b792da1d1f3a7cc to your computer and use it in GitHub Desktop.
Add TM
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import subprocess | |
import os | |
import sys | |
import BaseHTTPServer | |
from SimpleHTTPServer import SimpleHTTPRequestHandler | |
import requests | |
from lxml import etree | |
from StringIO import StringIO | |
def add_tm(text, word_len=6, mark=u'\u2122'):
    """Return *text* with *mark* appended to every word of *word_len* letters.

    A "word" is a maximal run of characters for which ``isalpha()`` is
    true; digits, punctuation and whitespace all end the run.  Words that
    are shorter or longer than *word_len* are copied unchanged.

    Args:
        text: String to scan; may be empty.
        word_len: Exact number of letters a word must have to be marked.
        mark: String inserted directly after each matching word.

    Returns:
        The marked-up string.
    """
    pieces = []
    run = 0  # letters seen so far in the current word
    for char in text:
        if char.isalpha():
            run += 1
        else:
            # A non-letter closes the current word; emit the mark before
            # the delimiter so it sits directly after the word.
            if run == word_len:
                pieces.append(mark)
            run = 0
        pieces.append(char)
    # The text may end while a matching word is still open.
    if run == word_len:
        pieces.append(mark)
    # Join once instead of growing a string with += for every character.
    return "".join(pieces)
def process_line(line, encoding):
    """Add the trademark sign to the text of an HTML page.

    The page is parsed in lxml's HTML mode, the text and tail of its nodes
    are passed through ``add_tm``, and the tree is serialized back to HTML.
    The contents of ``<script>`` and ``<style>`` are left untouched.

    Args:
        line: Complete HTML document as a string.
        encoding: Encoding name handed to ``etree.tostring`` for the output.

    Returns:
        The rewritten document as produced by ``etree.tostring``.
    """
    # Contents of these tags are code, not prose: a mark inserted there
    # would corrupt the page's JavaScript or CSS.
    raw_text_tags = ('script', 'style')

    parser = etree.iterparse(
        StringIO(line), events=('start', 'end'), html=True)
    # Drain the parser before touching any node, so the tree is walked only
    # once it is finished and every text/tail is read in its final form.
    for _event, _element in parser:
        pass

    for node in parser.root.iter():
        # Comments and processing instructions have a non-string tag; their
        # own text is not page text, so only real elements are rewritten.
        is_element = isinstance(node.tag, basestring)
        if is_element and node.tag not in raw_text_tags:
            if node.text and len(node.text) >= 6:
                node.text = add_tm(node.text)
        # The tail is the text after the node's end (before the next tag).
        # It belongs to the parent, so it is processed even for
        # <script>/<style> and for comments.
        if node.tail and len(node.tail) >= 6:
            node.tail = add_tm(node.tail)

    return etree.tostring(parser.root, method='html', encoding=encoding)
class MyHttpHandler(SimpleHTTPRequestHandler):
    """Proxy handler: fetch the requested path from ``site`` and relay it.

    HTML responses are rewritten by ``process_line``; every other content
    type is relayed unchanged.
    """

    # Upstream host name without the scheme, e.g. 'example.com'.
    site = ''

    def do_GET(self):
        """Serve a GET request by relaying the upstream response."""
        upstream = requests.get('http://' + self.site + self.path)
        # A response is not required to carry Content-Type; a missing
        # header is treated as "not HTML" instead of raising KeyError.
        content_type = upstream.headers.get('content-type', '')

        body = upstream.content
        # An empty body is relayed as is; there is nothing to parse.
        if body and 'text/html' in content_type:
            body = process_line(body, upstream.encoding)

        # The length is recomputed because rewriting changes the body size.
        self.send_response(upstream.status_code)
        self.send_header('content-length', str(len(body)))
        if content_type:
            self.send_header('content-type', content_type)
        self.end_headers()
        self.wfile.write(body)
if __name__ == '__main__':
    # Command-line entry point: parse options, start the proxy and open
    # the proxied page in the desktop browser.
    parser = argparse.ArgumentParser(
        description=u'Add \u2122 to each 6-character word on page.')
    parser.add_argument('-p', '--port', default=8000,
                        type=int, help='local port, 8000')
    parser.add_argument('-H', '--host', default='localhost',
                        type=str, help='hostname, localhost')
    parser.add_argument('-s', '--site', default='habrahabr.ru',
                        type=str, help='site (without http://) , habrahabr.ru')
    args = parser.parse_args()

    print("Launching proxy on {}:{}, opening {}".format(
        args.host, args.port, args.site))

    # For the default site, open a specific article instead of the root.
    if args.site == 'habrahabr.ru':
        page = 'company/yandex/blog/258673'
    else:
        page = ''

    MyHttpHandler.site = args.site
    # Create the server before launching the browser: the constructor binds
    # and listens, so the browser's first request cannot arrive too early,
    # and a busy port fails before any browser window is opened.
    server = BaseHTTPServer.HTTPServer(
        (args.host, args.port), MyHttpHandler)

    url = 'http://{}:{}/{}'.format(args.host, args.port, page)
    try:
        subprocess.Popen(['xdg-open', url])
    except OSError:
        # Opening the browser is a convenience; without xdg-open the proxy
        # should still run.
        print("Could not run xdg-open; open {} manually".format(url))

    server.serve_forever()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment