Last active
November 22, 2020 21:22
Revisions
-
stevehanson revised this gist
Nov 14, 2013 . 1 changed file with 2 additions and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -6,7 +6,8 @@ INDEX = 'test' TYPE = 'attachment' TMP_FILE_NAME = 'tmp.json' # for supported formats, see apache tika - http://tika.apache.org/1.4/formats.html INDEX_FILE_TYPES = ['html','pdf', 'doc', 'docx', 'xls', 'xlsx', 'xml'] def main(): -
stevehanson revised this gist
Nov 14, 2013 . 1 changed file with 0 additions and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,7 +1,6 @@ import os import sys # constants, configure to match your environment HOST = 'http://localhost:9200' INDEX = 'test' -
stevehanson created this gist
Nov 14, 2013 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,102 @@ import os import sys # constants, configure to match your environment HOST = 'http://localhost:9200' INDEX = 'test' TYPE = 'attachment' TMP_FILE_NAME = 'tmp.json' INDEX_FILE_TYPES = ['html','pdf', 'doc', 'docx', 'xls', 'xlsx', 'jsp', 'xml'] def main(): indexDirectory = raw_input('Index entire directory [Y/n]: ') if not indexDirectory: indexDirectory = 'y' if indexDirectory.lower() == 'y': dir = raw_input('Directory to index (relative to script): ') indexDir(dir) else: fname = raw_input('File to index (relative to script): ') createIndexIfDoesntExist() indexFile(fname) def indexFile(fname): print '\nIndexing ' + fname createEncodedTempFile(fname) postFileToTheIndex() os.remove(TMP_FILE_NAME) print '\n-----------' def indexDir(dir): print 'Indexing dir ' + dir createIndexIfDoesntExist() for path, dirs, files in os.walk(dir): for file in files: fname = os.path.join(path,file) base,extension = file.rsplit('.',1) if extension.lower() in INDEX_FILE_TYPES: indexFile(fname) else: 'Skipping {}, not approved file type: {}'.format(fname, extension) def postFileToTheIndex(): cmd = 'curl -X POST "{}/{}/{}" -d @'.format(HOST,INDEX,TYPE) + TMP_FILE_NAME print cmd os.system(cmd) def createEncodedTempFile(fname): import json file64 = open(fname, "rb").read().encode("base64") print 'writing JSON with base64 encoded file to temp file {}'.format(TMP_FILE_NAME) f = open(TMP_FILE_NAME, 'w') data = { 'file': file64, 'title': fname } json.dump(data, f) # dump json to tmp file f.close() def createIndexIfDoesntExist(): import urllib2 class HeadRequest(urllib2.Request): def get_method(self): return "HEAD" # check if type exists by sending HEAD request to index try: urllib2.urlopen(HeadRequest(HOST + '/' + INDEX + '/' + TYPE)) except urllib2.HTTPError, e: if e.code == 404: print 'Index doesnt exist, creating...' os.system('curl -X PUT "{}/{}/{}/_mapping" -d'.format(HOST,INDEX,TYPE) + ''' '{ "attachment" : { "properties" : { "file" : { "type" : "attachment", "fields" : { "title" : { "store" : "yes" }, "file" : { "term_vector":"with_positions_offsets", "store":"yes" } } } } } }' ''') else: print 'Failed to retrieve index with error code - %s.' % e.code # kick off the main function when script loads main()