Last active
June 3, 2019 15:53
-
-
Save leisurelicht/d7d0005abdf8b743f90bc99ba35ac0d2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
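#!/usr/bin/python
"""
Interview question.

Q: Given a large log file whose entries contain an access time and an access
IP, how do you find the IPs that are accessed more than 100 times within a
single minute?

The entries in the access log are ordered by time, so the file can be
processed line by line: drop the seconds from each timestamp and use a
dictionary to count the accesses of every IP within the current minute.
When the next minute begins, clear the dictionary and start counting again.
"""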
import random | |
from datetime import datetime, timedelta | |
def genlog(path="sample.log", minutes=60, max_per_second=150):
    """Write a synthetic, time-ordered access log for exercising ``main``.

    Each line has the form ``"YYYY-mm-dd HH:MM:SS<TAB>ip"``. Timestamps all
    fall in hour 00 of 2019-05-19 and are written newest-first (minute and
    second both count down), so lines belonging to the same minute are always
    contiguous. Every second gets between 0 and ``max_per_second`` lines, each
    with an IP drawn at random from a pool of 50 random addresses.

    Args:
        path: File to (over)write. Defaults to ``"sample.log"``.
        minutes: How many minutes of traffic to generate (0-60).
        max_per_second: Upper bound (inclusive) on lines written per second.

    Raises:
        ValueError: If ``minutes`` is outside 0-60 (the log covers one hour).
    """
    if not 0 <= minutes <= 60:
        raise ValueError("minutes must be between 0 and 60")

    # Pool of 50 random dotted-quad addresses the log lines draw from.
    ips = [
        ".".join(str(random.randint(0, 255)) for _ in range(4))
        for _ in range(50)
    ]

    # The context manager closes the file even if a write fails part-way.
    with open(path, "w", encoding="utf-8") as f:
        for minute in range(minutes - 1, -1, -1):
            for second in range(59, -1, -1):
                # The timestamp only changes once per second, so format it
                # once here rather than once per written line.
                stamp = datetime(2019, 5, 19, 0, minute, second).strftime(
                    "%Y-%m-%d %H:%M:%S"
                )
                for _ in range(random.randint(0, max_per_second)):
                    f.write("{}\t{}\n".format(stamp, random.choice(ips)))
def _collect_exceeding(counts, threshold, exceed):
    """Add every IP in *counts* with more than *threshold* hits to *exceed*."""
    exceed.update(addr for addr, hits in counts.items() if hits > threshold)


def main(path="sample.log", threshold=100):
    """Return the IPs that were seen more than ``threshold`` times in a minute.

    The log is read line by line; each line is ``"<timestamp><TAB><ip>"`` with
    a ``YYYY-mm-dd HH:MM:SS`` timestamp. Lines of the same minute must be
    contiguous (the file is ordered by time), which lets the function keep
    counters for only one minute at a time.

    Args:
        path: Log file to scan. Defaults to ``"sample.log"``.
        threshold: An IP is reported when its per-minute count is strictly
            greater than this value. Defaults to 100.

    Returns:
        A set of IP strings that exceeded the threshold in at least one minute.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            tab-separated fields.
    """
    current = None
    counts = {}
    exceed = set()

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            ds, ip = line.split("\t")
            minute = ds[:16]  # "YYYY-mm-dd HH:MM": minute-level precision
            if minute != current:
                # A new minute started: harvest the finished one and reset.
                # The harvest uses its own variable names so the ``ip`` parsed
                # from the current line is not overwritten.
                _collect_exceeding(counts, threshold, exceed)
                current = minute
                counts = {}
            counts[ip] = counts.get(ip, 0) + 1

    # The last minute in the file is never followed by a minute change, so it
    # has to be harvested explicitly.
    _collect_exceeding(counts, threshold, exceed)
    return exceed
if __name__ == "__main__":
    # Regenerate the sample log, then report the IPs that main() flags.
    genlog()
    offenders = main()
    print(offenders)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment