Created
July 23, 2014 21:00
-
-
Save jedisct1/6f4b835b21f1e6d4b741 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env ruby | |
require 'awesome_print' | |
require 'msgpack' | |
require 'public_suffix' | |
require 'singleton' | |
# N-gram frequency model over host names, exposed as a singleton.
#
# The model is trained from a list of names (one per line) and cached on disk
# as MessagePack. It offers three independent measures for a name: a weighted
# n-gram score, a bigram perplexity and a byte-level Shannon entropy.
# NOTE(review): the class name suggests these are meant to flag DGA-style
# (algorithmically generated) domains -- confirm with the callers.
class DGAScore
  include Singleton

  # N-gram orders tracked by the model.
  NS = (1..4)
  # Path of the MessagePack-serialized model cache.
  NGRAMS_FILE = 'tmp/ngrams'
  # Path of the training corpus, read one name per line.
  TRAINING_FILE = 'data/alexa-1m.txt'

  # Loads the cached model when NGRAMS_FILE exists; otherwise trains it from
  # TRAINING_FILE and writes the cache.
  def initialize
    # `TRUE` is no longer defined in modern Ruby, hence the literal `true`.
    # NOTE(review): newer public_suffix releases appear to have dropped this
    # class-level setter -- confirm against the installed gem; the guard keeps
    # the constructor working either way.
    if PublicSuffix::List.respond_to?(:private_domains=)
      PublicSuffix::List.private_domains = true
    end
    @ngrams_chain = []
    if File.exist?(NGRAMS_FILE)
      # Binary read/write helpers open and close the file for us.
      @ngrams_chain = MessagePack.unpack(File.binread(NGRAMS_FILE))
    else
      build_ngrams
      File.binwrite(NGRAMS_FILE, MessagePack.pack(@ngrams_chain))
    end
  end

  # Trains the model from TRAINING_FILE, then turns counts into frequencies.
  # Every training name is given a trailing '.' if it lacks one.
  def build_ngrams
    File.foreach(TRAINING_FILE) do |line|
      name = line.chomp
      name += '.' unless name.end_with?('.')
      add_training_string(name)
    end
    normalize_ngrams_chain
  end

  # Adds the occurrence counts of every n-gram of length +n+ found in +str+.
  #
  # @param n [Integer] n-gram length
  # @param str [String] training string (already lowercased by the caller)
  def update_ngram_chain(n, str)
    chain = (@ngrams_chain[n] ||= {})
    # The range is empty when str is shorter than n.
    (0..str.length - n).each do |i|
      ngram = str[i, n]
      chain[ngram] = chain.fetch(ngram, 0) + 1
    end
  end

  # Replaces the counts stored for order +n+ by relative frequencies.
  # Does nothing when that order has no entries (e.g. nothing was trained).
  #
  # @param n [Integer] n-gram length
  def normalize_ngram_chain(n)
    chain = @ngrams_chain[n]
    return if chain.nil? || chain.empty?

    total = chain.values.inject(0, :+).to_f
    # Only existing keys are reassigned, which is safe while iterating.
    chain.each_key { |ngram| chain[ngram] /= total }
  end

  # Normalizes every order listed in NS.
  def normalize_ngrams_chain
    NS.each { |n| normalize_ngram_chain(n) }
  end

  # Feeds one string into the model for every order in NS.
  # The string is lowercased first; the argument itself is not modified.
  #
  # @param str [String]
  def add_training_string(str)
    lowered = str.downcase
    NS.each { |n| update_ngram_chain(n, lowered) }
  end

  # Pretty-prints the raw model using awesome_print.
  def dump
    ap @ngrams_chain
  end

  # Mean stored frequency of the n-grams of length +n+ in +str+; an n-gram
  # absent from the model contributes -1.0 instead of a frequency.
  # The lookup is case-sensitive and the model only holds lowercase n-grams.
  #
  # @param n [Integer] n-gram length
  # @param str [String]
  # @return [Float] 0.0 when +str+ is shorter than +n+
  def score_for_ngram(n, str)
    return 0.0 if str.length < n

    # An order that was never trained behaves like an empty table.
    chain = @ngrams_chain[n] || {}
    windows = str.length - n + 1
    total = (0...windows).inject(0.0) do |acc, i|
      acc + (chain[str[i, n]] || -1.0)
    end
    total / windows
  end

  # Bigram perplexity of +str+, rescaled and clamped to 0.0..100.0.
  #
  # A leading www/imap/mail/mx/smtp/ns label (optionally followed by '-' and
  # digits) and a trailing ".xx.yyy" style suffix are removed first. The input
  # is lowercased so that it matches the lowercase-only model.
  #
  # @param str [String]
  # @return [Float] 0.0 when fewer than two characters remain
  def perplexity_for_string(str)
    str = str.downcase
    str.gsub!(%r{^(www|imap|mail|mx|smtp|ns)-?\d*\.}i, '')
    str.gsub!(%r{(\.[a-z]{2})\.[a-z]+\.?$}i, '')
    ns_first, ns_last = 1, 2
    return 0.0 if str.length < ns_last

    windows = str.length - ns_last + 1
    log_sum = (0...windows).inject(0.0) do |acc, i|
      # Frequency of the longest n-gram divided by the frequencies of its
      # shorter prefixes; unseen n-grams count as 1e-6.
      ratio = @ngrams_chain.dig(ns_last, str[i, ns_last]) || 1e-6
      (ns_last - 1).downto(ns_first) do |n|
        ratio /= @ngrams_chain.dig(n, str[i, n]) || 1e-6
      end
      acc + Math.log(ratio)
    end
    perplexity = Math.exp(-log_sum / windows)
    # NOTE(review): 3.22 and 2006.8 look like empirically chosen offset/scale
    # values -- their origin is not visible in this file.
    scaled = (perplexity - 3.22) * 100.0 / 2006.8
    [100.0, [scaled, 0.0].max].min
  end

  # Weighted n-gram score of +str+, clamped to 0.0..100.0.
  #
  # Each order n in NS contributes score_for_ngram(n, ...) with weight n. The
  # input is lowercased (the model only holds lowercase n-grams) and given a
  # trailing '.' like the training names. Unknown n-grams push the result
  # towards 100.0.
  #
  # @param str [String]
  # @return [Float]
  def score_for_string(str)
    str = str.downcase
    str += '.' unless str.end_with?('.')
    total_weight = 0
    weighted = NS.inject(0.0) do |acc, n|
      total_weight += n
      acc + score_for_ngram(n, str) * n
    end
    # NOTE(review): 0.303 looks like an empirically chosen scale -- its origin
    # is not visible in this file.
    score = weighted / total_weight * -100.0 / 0.303
    [100.0, [0.0, score].max].min
  end

  # Shannon entropy, in bits per byte, of the second-level label of +str+.
  # When PublicSuffix cannot parse +str+ (or yields no label) the whole string
  # is used instead.
  #
  # @param str [String]
  # @return [Float] 0.0 for an empty string
  def entropy_for_string(str)
    label =
      begin
        PublicSuffix.parse(str).sld || str
      rescue StandardError
        # Best effort: fall back to the raw input. StandardError (rather than
        # Exception) lets signals and exit requests propagate.
        str
      end
    bytes = label.bytes
    # Divide by the byte count, since the frequencies are per byte.
    len = bytes.length.to_f
    counts = bytes.each_with_object(Hash.new(0)) { |byte, acc| acc[byte] += 1 }
    counts.each_value.inject(0.0) do |acc, count|
      x = count / len
      acc - x * Math.log2(x)
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment