Created
September 8, 2015 23:18
-
-
Save veesa/d79e89581ff3fe314113 to your computer and use it in GitHub Desktop.
An Apache Log Parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import os
import re
import sys
from collections import Counter
# Module-level starting values for the log summary.
log_list = []        # names of the log files to read
counter = 0          # number of log lines read
offsite = 0          # number of lines flagged as off-site
isOffsite = False    # off-site flag for the current line (was the `bool` type, which is truthy)
num_bytes_total = 0  # sum of the byte-count field over all lines
existing_bytes = 0   # scratch value used while summing bytes
url_list = []        # requested URLs, one entry per counted hit
usage_dict = {}      # requested URL -> byte count
def get_offsite(log_line):
    """Return True when ``log_line`` contains the substring ``example``.

    This is the per-line "referred from offsite" test: it answers for a
    single log line and does no counting itself.  The match is a plain,
    case-sensitive substring check anywhere in the line.

    NOTE(review): 'example' looks like a placeholder for a real domain;
    confirm the intended marker and whether a match should mean off-site
    or on-site.

    :param log_line: one access-log line as text.
    :returns: ``True`` if ``'example'`` occurs in the line, else ``False``.
    """
    # A literal needle needs no regex; `in` already yields a bool.
    return 'example' in log_line
def empty_stats():
    """Return a fresh statistics dict with every tally at zero."""
    return {
        'requests': 0,          # non-blank log lines seen
        'offsite': 0,           # lines flagged by the off-site test
        'total_bytes': 0,       # sum of the byte-count field (tracked, not printed)
        'url_hits': Counter(),  # request target -> number of status-200 hits
        'usage': {},            # customer (first path segment) -> bytes sent
    }


def customer_of(url):
    """Return the first path segment of ``url`` (``/cust/page`` -> ``cust``).

    A target without any ``/`` (for example ``*``) is returned unchanged
    instead of raising ``IndexError``.
    """
    parts = url.split('/')
    return parts[1] if len(parts) > 1 else url


def parse_log_lines(lines, stats=None, is_offsite=None):
    """Fold an iterable of access-log lines into a statistics dict.

    Assumes the Apache common/combined layout in which the whitespace-split
    fields 6, 8 and 9 are the request target, the status code and the
    response size -- confirm against the server's LogFormat.

    :param lines: iterable of text lines (an open file works).
    :param stats: dict from ``empty_stats()`` to keep accumulating into;
        a new one is created when omitted.
    :param is_offsite: callable ``line -> bool``; defaults to the module's
        ``get_offsite``.
    :returns: the updated statistics dict.
    """
    if stats is None:
        stats = empty_stats()
    if is_offsite is None:
        is_offsite = get_offsite

    for raw in lines:
        line = raw.strip()
        if not line:
            # A blank line is not end-of-file; skip it and keep reading.
            continue

        stats['requests'] += 1
        if is_offsite(line):
            stats['offsite'] += 1

        fields = line.split()
        if len(fields) < 10:
            # Too short to carry target/status/size: counted as a request,
            # but there is nothing further to tally.
            continue
        url, status, size = fields[6], fields[8], fields[9]

        # Apache writes "-" when no body was sent; treat that as zero bytes.
        sent = int(size) if size.isdigit() else 0
        stats['total_bytes'] += sent

        # Compare the status field itself so a byte count of 200 is not
        # mistaken for a successful response.  Other 2XX codes can be
        # added to this test.
        if status == '200':
            stats['url_hits'][url] += 1

        # Sum (rather than overwrite) so repeat requests all count.
        # NOTE(review): the original indentation was lost, so it is unclear
        # whether usage was meant to cover only status-200 lines; all
        # requests are counted here.
        customer = customer_of(url)
        stats['usage'][customer] = stats['usage'].get(customer, 0) + sent

    return stats


def collect_stats(log_dir, is_offsite=None):
    """Parse every file found under ``log_dir`` (recursively).

    :param log_dir: directory to walk.
    :param is_offsite: forwarded to ``parse_log_lines``.
    :returns: the combined statistics dict.
    """
    stats = empty_stats()
    for dirpath, _dirnames, filenames in os.walk(log_dir):
        for name in filenames:
            # Join with the directory being walked, not the root, so files
            # in subdirectories resolve correctly.
            with open(os.path.join(dirpath, name), 'r') as handle:
                parse_log_lines(handle, stats, is_offsite)
    return stats


def format_report(stats):
    """Render the off-site ratio, top-10 URLs and per-customer usage.

    :param stats: dict produced by ``parse_log_lines``/``collect_stats``.
    :returns: the report as one string, without a trailing newline.
    """
    requests = stats['requests']
    offsite = stats['offsite']
    # No lines at all would otherwise divide by zero.
    percentage = (1.0 * offsite / requests) * 100 if requests else 0.0

    out = ["Off-site requests: {} of {} (%{:.2f})".format(offsite, requests, percentage)]

    out.append("\nTop 10 URLS:")
    for url, hits in stats['url_hits'].most_common(10):
        out.append('\t' + str(hits) + ' - ' + str(url))

    out.append("\nCustomer usage summary:")
    # Sorted so the listing is stable from run to run.
    for customer in sorted(stats['usage']):
        gigabytes = stats['usage'][customer] / 1000000000.0
        out.append("\t{:.2f} GB - {}".format(gigabytes, customer))

    return "\n".join(out)


def main(argv=None):
    """Command-line entry point: parse ``-d/--dir`` and print the report.

    Exits with status 1 when the given directory does not exist.

    :param argv: argument list; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="Directory of customer to be searched.")
    parser.add_argument(
        "-d", "--dir", required=True, help="Absolute path of the customer directory."
    )
    args = parser.parse_args(argv)

    if not os.path.isdir(args.dir):
        print("The directory given does not exist. Exiting program.")
        sys.exit(1)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(format_report(collect_stats(args.dir)))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment