Skip to content

Instantly share code, notes, and snippets.

@veesa
Created September 8, 2015 23:18
Show Gist options
  • Save veesa/d79e89581ff3fe314113 to your computer and use it in GitHub Desktop.
Save veesa/d79e89581ff3fe314113 to your computer and use it in GitHub Desktop.
An Apache Log Parser
import argparse
import os
import sys
import re
from collections import Counter
# Module-level accumulators shared by the log-scanning loop and the reports
# printed at the end of the script.
log_list = []        # file names of the directory currently being scanned
counter = 0          # number of log lines (requests) processed so far
offsite = 0          # number of lines for which get_offsite() returned True
# Result of get_offsite() for the most recent line.  Starts as a real boolean;
# binding the builtin type `bool` here would be truthy if ever read early.
isOffsite = False
num_bytes_total = 0  # running sum of the byte-count field over all lines
existing_bytes = 0   # scratch value used while building the usage summary
url_list = []        # request URL of every line treated as a 200 response
usage_dict = {}      # request URL -> byte count recorded for that URL
def get_offsite(log_line, pattern='example'):
    """Report whether *log_line* matches *pattern*.

    The script counts a line as an off-site request when this returns True.

    Args:
        log_line: One raw line of the access log.
        pattern: Regular expression searched for anywhere in the line.
            Defaults to 'example', the value that was previously hard-coded.

    Returns:
        True if ``re.search`` finds *pattern* in *log_line*, otherwise False.
    """
    # NOTE(review): a line is flagged when the pattern IS present -- confirm
    # that 'example' really identifies off-site referrers for this data set.
    return re.search(pattern, log_line) is not None
# Command-line interface: the script takes one required option, the
# directory that holds the customer's log files.
parser = argparse.ArgumentParser(description="Directory of customer to be searched.")
parser.add_argument(
    "-d", "--dir", required=True, help="Absolute path of the customer directory."
)
args = parser.parse_args()

# Stop early, with exit status 1, when the path is not an existing directory.
if not os.path.isdir(args.dir):
    # print(...) with a single argument behaves the same on Python 2 and 3,
    # whereas the bare print statement is a syntax error on Python 3.
    print("The directory given does not exist. Exiting program.")
    sys.exit(1)
# Scan every file under the customer directory and fold each log line into
# the module-level accumulators (counter, offsite, num_bytes_total, url_list,
# usage_dict).
# Field positions assume whitespace-split Apache access-log lines in which
# index 6 is the request URL, index 8 the status code and index 9 the byte
# count -- TODO confirm against the LogFormat actually in use.
for dirpath, _dirnames, log_list in os.walk(args.dir):
    for log in log_list:
        # Join with dirpath, not args.dir: os.walk also yields files that live
        # in subdirectories, and those do not exist directly under args.dir.
        with open(os.path.join(dirpath, log), 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # A blank line is skipped; it must not end the whole file.
                    continue
                counter += 1

                # Get whether the line is referred from offsite or not
                isOffsite = get_offsite(line)
                if isOffsite:
                    offsite += 1

                line_list = line.split()
                if len(line_list) < 10:
                    # Too few fields to hold URL/status/bytes: the line stays
                    # counted above but contributes nothing further.
                    continue
                url = line_list[6]

                # Apache writes "-" instead of a number when no body bytes
                # were sent; treat any non-numeric byte count as 0.
                num_bytes = int(line_list[9]) if line_list[9].isdigit() else 0

                # Get the total number of bytes
                num_bytes_total += num_bytes

                # Build the url list from successful requests only.  Compare
                # the status field itself: a membership test on the whole line
                # would also match e.g. a 404 that happened to send 200 bytes.
                if line_list[8] == '200':  # You can just add the other 2XX response numbers here
                    url_list.append(url)

                # Build the usage data dictionary: accumulate the bytes of
                # every request for this URL instead of keeping only the last.
                usage_dict[url] = int(usage_dict.get(url, 0)) + num_bytes
# Share of requests flagged as off-site.  With no log lines at all, counter is
# 0, so report 0% instead of dividing by zero.
if counter:
    percentage = (1.0 * offsite / counter) * 100
else:
    percentage = 0.0
# Single-argument print(...) calls run unchanged on Python 2 and Python 3.
print("Off-site requests: {} of {} (%{:.2f})".format(offsite, counter, percentage))

# Ten most frequently requested URLs, printed as "<hits> - <url>".
templist = Counter(url_list).most_common(10)
print("\nTop 10 URLS:")
for url, hits in templist:
    print('\t' + str(hits) + ' - ' + str(url))
# Per-customer usage: group the per-URL byte counts by the first path segment
# of the URL and print each group's total in GB (1 GB = 10**9 bytes here).
# Single-argument print(...) calls run unchanged on Python 2 and Python 3.
print("\nCustomer usage summary:")
tempdict = {}
for url in usage_dict:
    segments = url.split('/')
    # "/customer/page" splits into ['', 'customer', 'page'], so index 1 is the
    # grouping key.  A URL without any "/" (e.g. "*") has no index 1; group it
    # under the URL itself rather than raising IndexError.
    domain = segments[1] if len(segments) > 1 else url
    # int() accepts the byte count whether it was stored as text or a number.
    tempdict[domain] = tempdict.get(domain, 0) + int(usage_dict[url])

for domain in tempdict:
    gigabytes = float(tempdict[domain]) / 1000000000.0
    print("\t{:.2f} GB - {}".format(gigabytes, domain))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment