Created
May 6, 2017 12:17
-
-
Save georgy7/8a23696ff128df0ed11b0ffe20686964 to your computer and use it in GitHub Desktop.
Статьи с тегами. Тестовые данные для бенчмарков.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.idea | |
.directory | |
data |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# encoding: utf-8
require 'securerandom'
require 'set'
require 'json'

# Number of distinct tags to generate.
TAG_COUNT = 10_000
# Number of article titles to generate.
ARTICLE_COUNT = 1_000_000
# Returns a random alphanumeric token: a SecureRandom base64 string with every
# '/', '\', '+' and '=' character removed.
#
# @return [String] the token; its length varies with how many characters
#   were stripped
def randstr
  # One character-class pass instead of four chained gsub calls.
  SecureRandom.base64.gsub(%r{[/\\+=]}, '')
end
# Generates +count+ unique random tag names and writes them to
# data/tags.json (a JSON array, one tag per line) and data/tags.txt
# (one tag per line). The data directory must already exist.
#
# @param count [Integer] number of tags to generate (defaults to TAG_COUNT)
# @return [Array<String>] the generated tags, in the order they were written
def genTags(count = TAG_COUNT)
  seen = Set.new
  tags = []

  # Block-form File.open closes both files even if generation raises midway.
  File.open('data/tags.json', 'w') do |tags_json|
    File.open('data/tags.txt', 'w') do |tags_txt|
      tags_json.puts('[')
      1.upto(count) do |i|
        # Truncating to 9 characters makes collisions more likely than with
        # full-length strings, so keep drawing until the tag is a new one.
        # Set#add? returns nil when the element is already present.
        tag = randstr[0..8]
        tag = randstr[0..8] until seen.add?(tag)

        tags.push(tag)
        tags_txt.puts(tag)
        # JSON forbids a trailing comma, so the last entry gets none.
        separator = i == count ? '' : ','
        tags_json.puts(" \"#{tag}\"#{separator}")
      end
      tags_json.puts(']')
    end
  end

  tags
end
# Generates +count+ random article titles and writes them to
# data/article_names.json (a JSON array, one title per line) and
# data/article_names.txt (one title per line). The data directory must
# already exist. Titles are not de-duplicated here.
#
# @param count [Integer] number of titles to generate (defaults to ARTICLE_COUNT)
# @return [Array<String>] the generated titles, in the order they were written
def gen_article_names(count = ARTICLE_COUNT)
  titles = []

  # Block-form File.open closes both files even if generation raises midway.
  File.open('data/article_names.json', 'w') do |names_json|
    File.open('data/article_names.txt', 'w') do |names_txt|
      names_json.puts('[')
      1.upto(count) do |i|
        title = randstr
        titles.push(title)
        names_txt.puts(title)
        # JSON forbids a trailing comma, so the last entry gets none.
        separator = i == count ? '' : ','
        names_json.puts(" \"#{title}\"#{separator}")
      end
      names_json.puts(']')
    end
  end

  titles
end
# Draws +count+ distinct random integers from 0...max_exclusively.
#
# @param count [Integer] how many distinct values to return; zero or a
#   negative number yields an empty array
# @param max_exclusively [Integer] exclusive upper bound of the drawn values
# @return [Array<Integer>] the distinct values, in first-drawn order
# @raise [ArgumentError] if more distinct values are requested than the range
#   contains (the draw could never finish)
# @raise [RuntimeError] if rand returns a value outside the requested range
def unique_randoms(count, max_exclusively)
  if count > max_exclusively
    raise ArgumentError,
          "cannot draw #{count} unique values from a range of #{max_exclusively}"
  end

  picked = Set.new
  # Check the size before drawing so that count == 0 returns [] rather than
  # a single unwanted element.
  while picked.size < count
    r = rand(max_exclusively)
    raise 'bad random (too big)' unless r < max_exclusively
    picked.add(r)
  end
  picked.to_a
end
# Assigns a random set of tags to every article and writes two files:
# data/references_only_<maxtagsperpage>.json (article index and the indices of
# its tags) and data/articles_with_tags_<maxtagsperpage>.json (article title
# and its tag names). The data directory must already exist.
#
# @param tag_names [Array<String>] pool of tag names to pick from
# @param article_names [Array<String>] article titles, one entry per article
# @param maxtagsperpage [Integer] upper bound (inclusive) of the random tag
#   count requested for each article
# @return [nil]
def gen_references(tag_names, article_names, maxtagsperpage)
  last_index = article_names.size - 1

  # Block-form File.open closes both files even if generation raises midway.
  File.open("data/references_only_#{maxtagsperpage}.json", 'w') do |references|
    File.open("data/articles_with_tags_#{maxtagsperpage}.json", 'w') do |articles_with_tags|
      references.puts('[')
      articles_with_tags.puts('[')

      article_names.each_with_index do |article, article_index|
        tags_count = rand(0..maxtagsperpage)
        tag_indices = unique_randoms(tags_count, tag_names.size)
        article_tags = tag_indices.map { |ti| tag_names[ti] }
        # No trailing comma after the last entry.
        separator = article_index == last_index ? '' : ','

        # NOTE(review): "index: [..]" entries inside [ ] are not valid JSON;
        # the format is kept as-is because consumers of this file may rely on
        # it. Confirm the intended format before changing it.
        references.puts(" #{article_index}: #{JSON.generate(tag_indices)}#{separator}")
        # to_json quotes and escapes the title, so a title containing quotes
        # or backslashes cannot corrupt the output.
        articles_with_tags.puts(
          " {\"title\": #{article.to_json}, \"tags\": #{JSON.generate(article_tags)}}#{separator}"
        )
      end

      references.puts(']')
      articles_with_tags.puts(']')
    end
  end

  nil
end
# Entry point: creates the data directory if it is missing, generates the tags
# and article titles, sanity-checks them, and then writes the reference files
# for each per-article tag limit.
#
# @raise [RuntimeError] if the generated collections have the wrong size or
#   the article titles contain duplicates
def main
  Dir.mkdir('data') unless Dir.exist?('data')

  tag_names = genTags
  article_names = gen_article_names

  raise 'bad tag_names size' unless tag_names.size == TAG_COUNT
  raise 'bad article_names size' unless article_names.size == ARTICLE_COUNT
  raise 'article_names has duplicates' unless article_names.size == article_names.uniq.size

  # Generate three different, independent data sets.
  [20, 100, 1000].each do |max_tags|
    gen_references(tag_names, article_names, max_tags)
  end
end
# Run the generator only when this file is executed directly, so that it can
# be required (e.g. from a test) without producing the data sets.
main if $PROGRAM_NAME == __FILE__
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment