Created
July 13, 2017 06:12
-
-
Save JIghtuse/0ad3e3b876a0ad4bc48cef9fedff514b to your computer and use it in GitHub Desktop.
HTTP log parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm>
#include <cctype>
#include <cerrno>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <limits>
#include <queue>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
// Number of entries listed per ranking when the command line does not
// supply its own value.
constexpr int kDefaultNumberOfResults = 100;

// Settings for a single run of the program.
struct Configuration {
    // File the log lines are read from.
    std::string in_path;
    // File the report is written to.
    std::string out_path;
    // Upper bound on the number of entries printed per ranking.
    int n{kDefaultNumberOfResults};
};
// Counters accumulated while scanning the input.
struct Statistics {
    // Occurrences of each domain.
    std::unordered_map<std::string, int> domains;
    // Occurrences of each path.
    std::unordered_map<std::string, int> paths;
    // Total number of URLs counted.
    int urlCount = 0;
};
// Builds the run configuration from the command line.
//
// Two layouts are recognised:
//   argc == 5: argv[2] is the number of results, argv[3] the input path and
//              argv[4] the output path. argv[1] is not inspected.
//   argc == 3: argv[1] is the input path and argv[2] the output path.
// Any other argument count yields a default-constructed Configuration, i.e.
// empty paths and the default number of results.
//
// The number of results is only taken from argv[2] when the whole argument
// is a decimal integer that fits in an int; otherwise the default is kept.
// (std::atoi would silently yield 0 for non-numeric text and has an
// undefined result for out-of-range values.)
Configuration parse_args(int argc, char **argv)
{
    Configuration config;
    if (argc == 5) {
        errno = 0;
        char* end = nullptr;
        const long requested = std::strtol(argv[2], &end, 10);
        const bool consumed_all = end != argv[2] && *end == '\0';
        const bool in_range = errno != ERANGE
            && requested >= std::numeric_limits<int>::min()
            && requested <= std::numeric_limits<int>::max();
        if (consumed_all && in_range) {
            config.n = static_cast<int>(requested);
        }
        config.in_path = argv[3];
        config.out_path = argv[4];
    } else if (argc == 3) {
        config.in_path = argv[1];
        config.out_path = argv[2];
    }
    return config;
}
// Reports whether `c` is accepted inside the domain part of a URL:
// an alphanumeric character (as classified by std::isalnum), '.' or '-'.
bool is_domain_character(char c)
{
    // std::isalnum is only defined for values representable as unsigned char
    // (or EOF); a plain char holding a byte >= 0x80 may be negative.
    const auto byte = static_cast<unsigned char>(c);
    return std::isalnum(byte) != 0 || c == '.' || c == '-';
}
// Reports whether `c` is accepted inside the path part of a URL:
// an alphanumeric character (as classified by std::isalnum) or one of
// '/', '.', '_', ',' and '+'.
bool is_path_character(char c)
{
    // std::isalnum is only defined for values representable as unsigned char
    // (or EOF); a plain char holding a byte >= 0x80 may be negative.
    const auto byte = static_cast<unsigned char>(c);
    if (std::isalnum(byte) != 0) {
        return true;
    }
    return c == '/' || c == '.' || c == '_' || c == ',' || c == '+';
}
// Scans every line of `in` for URLs and tallies them.
//
// A URL is recognised as "http://" or "https://" followed by a non-empty
// domain (characters accepted by is_domain_character) and an optional path.
// A path must start with '/' and extends for as long as is_path_character
// accepts the next character. A URL without a path is counted under "/".
//
// Returns the per-domain and per-path occurrence counts together with the
// total number of URLs counted.
Statistics gatherStats(std::ifstream& in)
{
    Statistics stat;
    const std::string scheme = "http";
    const std::string separator = "://";

    for (std::string line; std::getline(in, line);) {
        auto pos = line.find(scheme);
        while (pos != std::string::npos) {
            pos += scheme.length();
            // Optional 's' of "https".
            if (pos < line.length() && line[pos] == 's') {
                ++pos;
            }
            // Room is needed for "://" plus at least one domain character;
            // with less than that left, no further URL fits on this line.
            if (line.length() <= pos + separator.length()) {
                break;
            }
            if (line.compare(pos, separator.length(), separator) != 0) {
                pos = line.find(scheme, pos);
                continue;
            }
            pos += separator.length();

            const auto domain_begin = pos;
            while (pos < line.length() && is_domain_character(line[pos])) {
                ++pos;
            }
            if (pos == domain_begin) {
                pos = line.find(scheme, pos);
                continue;
            }
            const std::string domain = line.substr(domain_begin, pos - domain_begin);

            // Only a '/' opens a path. Any other character right after the
            // domain (including '_', ',' and '+', which are valid *inside* a
            // path) means the URL has no path. Restarting the search from
            // inside the path loop, as was done previously, could spin
            // forever on input such as "http://a.com_ http://b.com".
            std::string path = "/";
            if (pos < line.length() && line[pos] == '/') {
                const auto path_begin = pos;
                while (pos < line.length() && is_path_character(line[pos])) {
                    ++pos;
                }
                path = line.substr(path_begin, pos - path_begin);
            }

            ++stat.domains[domain];
            ++stat.paths[path];
            ++stat.urlCount;
            pos = line.find(scheme, pos);
        }
    }
    return stat;
}
// Writes one ranking to `out`.
//
// The first line is `title`. It is followed by at most `n` entries of `stat`,
// one per line, formatted as "<count> <key>". Entries are ordered by
// descending count; entries with equal counts are ordered by ascending key.
// When `n` is zero or negative only the title line is written.
void printStat(std::ofstream& out, const char* title, int n, const std::unordered_map<std::string, int>& stat)
{
    out << title << '\n';
    if (n <= 0 || stat.empty()) {
        return;
    }

    // Rank pointers to the map's own elements so that no key is copied.
    using Entry = const std::pair<const std::string, int>*;
    std::vector<Entry> ranked;
    ranked.reserve(stat.size());
    for (const auto& entry : stat) {
        ranked.push_back(&entry);
    }

    const auto ranks_before = [](Entry a, Entry b) {
        return a->second > b->second || (a->second == b->second && a->first < b->first);
    };

    // Only the first `shown` positions need to be in order.
    const std::size_t shown = std::min(ranked.size(), static_cast<std::size_t>(n));
    std::partial_sort(ranked.begin(),
                      ranked.begin() + static_cast<std::ptrdiff_t>(shown),
                      ranked.end(),
                      ranks_before);

    for (std::size_t i = 0; i < shown; ++i) {
        out << ranked[i]->second << ' ' << ranked[i]->first << '\n';
    }
}
void printStats(std::ofstream& out, const Configuration& config, const Statistics& stat) | |
{ | |
out << "total urls " << stat.urlCount | |
<< ", domains " << stat.domains.size() | |
<< ", paths " << stat.paths.size() | |
<< "\n\n"; | |
printStat(out, "top domains", config.n, stat.domains); | |
out << '\n'; | |
printStat(out, "top paths", config.n, stat.paths); | |
} | |
int main(int argc, char **argv) | |
{ | |
auto configuration = parse_args(argc, argv); | |
auto in = std::ifstream{configuration.in_path}; | |
if (!in) { | |
std::cerr << "Failed to open input file\n"; | |
return 1; | |
} | |
auto out = std::ofstream{configuration.out_path}; | |
if (!out) { | |
std::cerr << "Failed to open output file\n"; | |
return 1; | |
} | |
printStats(out, configuration, gatherStats(in)); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment