-
-
Save ThinkCode/1119047 to your computer and use it in GitHub Desktop.
Search for Keyword in Text File and return line number
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import re
import sys

import MySQLdb

# NOTE(review): the host, user, password and schema name are hard-coded, and
# the connection is opened as soon as this module is loaded (even when it is
# only imported). Consider moving these to configuration -- left as-is here
# because the script body below relies on the module-level `db` and `cursor`.
db = MySQLdb.connect("localhost", "testuser", "test123", "TESTDB")

#
# MySQL Code
# CREATE TABLE keyword (keywordid integer primary key auto_increment, keyword VARCHAR(255) NOT NULL);
#
# CREATE TABLE location (companyid integer, pdfid integer, keywordid integer references keyword(keywordid),
#     data TEXT, line integer, span tinyint, createdate timestamp default current_timestamp,
#     primary key (companyid, pdfid, keywordid, line));
# The primary key was chosen to eliminate duplicates of the same line number
# and keyword (the script inserts with "insert ignore").
#
cursor = db.cursor()
# Compiled once: iterwords() applies these to every line / every word, so
# building the patterns inside the loops would be wasted work.
_WORD_SEPARATOR = re.compile(r'\s+')
_STRIPPED_PUNCTUATION = re.compile(r'[,.:()]')


def iterwords(fh):
    """Yield ``(line_number, word)`` for each word of each line in *fh*.

    *fh* is any iterable of text lines (an open file, a list of strings).
    Line numbers are 0-based, as produced by ``enumerate``.

    Each line is split on runs of whitespace; the characters ``, . : ( )``
    are removed from every token and the result is lower-cased.  Tokens that
    are empty after this normalisation (blank lines, tokens made only of the
    stripped punctuation) are skipped, so callers never receive an
    empty-string "word".
    """
    for number, line in enumerate(fh):
        for token in _WORD_SEPARATOR.split(line.strip()):
            word = _STRIPPED_PUNCTUATION.sub('', token).lower()
            if word:
                yield number, word
def search(fh, query):
    """Find every occurrence of the phrase *query* in *fh*.

    *fh* is an iterable of text lines and is passed to ``iterwords``; *query*
    is a string of one or more whitespace-separated words.  The query is
    lower-cased and split on whitespace, then compared word-for-word against
    consecutive words produced by ``iterwords(fh)``.  Query words are not
    otherwise normalised, so punctuation inside the query is matched
    literally.

    Returns a tuple of ``(line, lines_count)`` pairs, one per occurrence, in
    the order they are found:

    * ``line`` is the 0-based line number of the first word of the match;
    * ``lines_count`` is the number of physical lines from the first to the
      last matched word, inclusive (1 when the phrase sits on one line).

    An empty or whitespace-only query yields an empty tuple.
    """
    # Function-scope import so the block does not depend on the file header.
    from collections import deque

    terms = query.lower().split()
    if not terms:
        return ()

    matches = []
    # Sliding window over the last len(terms) words.  Comparing the window
    # after every word means no word is skipped as a possible match start,
    # and nothing beyond the phrase itself is read, so a match that ends on
    # the last word of the input is handled like any other.
    window = deque(maxlen=len(terms))
    for number, word in iterwords(fh):
        window.append((number, word))
        if len(window) < len(terms):
            continue
        if all(seen == wanted for (_, seen), wanted in zip(window, terms)):
            first_line = window[0][0]
            matches.append((first_line, number - first_line + 1))
    return tuple(matches)
if __name__ == '__main__':
    # Usage: <script> DIRECTORY
    # Scans every "*.txt" file in DIRECTORY for each keyword stored in the
    # `keyword` table and records the hits in the `location` table.
    filepath = sys.argv[1]
    cursor.execute("select keywordid, keyword from keyword")
    keywordlist = cursor.fetchall()
    for htmfile in os.listdir(filepath):
        if not htmfile.endswith(".txt"):
            continue
        # File names are parsed as "<companyid>_<pdfid>.<extension>".
        name_parts = htmfile.split('_')
        companyid = name_parts[0]
        pdfid = name_parts[1].split('.')[0]
        print("companyid : " + companyid + " ---> PDF ID " + pdfid)

        # Read the file once and close it straight away.  Working from the
        # in-memory list means every keyword is searched from the first line
        # (no stale file position carried over from the previous keyword) and
        # several matches on the same line all resolve to that line.
        with open(os.path.join(filepath, htmfile)) as fh:
            file_lines = fh.readlines()

        for keyword in keywordlist:
            # keyword is a (keywordid, keyword) row from the query above.
            for lineno, linecount in search(file_lines, keyword[1]):
                # lineno is 0-based; take the matched line plus any
                # following lines the match is reported to cover.
                result_lines = file_lines[lineno:lineno + linecount]
                print(keyword[0])
                print("Match found on line {0} (spawning {1} lines):\n > {2}".format(
                    lineno + 1, linecount, ' > '.join(result_lines).strip()))
                # "insert ignore" + the table's primary key drop repeated
                # (company, pdf, keyword, line) rows.
                cursor.execute(
                    """insert ignore into location (companyid, pdfid, keywordid, data, line, span) values (%s,%s,%s,%s,%s,%s)""",
                    (companyid, pdfid, keyword[0], result_lines[0], lineno + 1, linecount))
    # Commit explicitly: without it the inserts are discarded on close when
    # the table uses a transactional storage engine.
    db.commit()
    db.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment