georgy7 · May 6, 2017 12:17
diff --git a/.gitignore b/.gitignore
 .idea
 .directory
 data
diff --git a/gendata.rb b/gendata.rb
 #!/usr/bin/env ruby
 # encoding: utf-8

 require 'securerandom'
 require 'set'
 require 'json'

 # Number of distinct tag names that genTags generates.
 TAG_COUNT = 10_000
 # Number of article titles that gen_article_names generates.
 ARTICLE_COUNT = 1_000_000

 # Returns a random string containing only letters and digits.
 #
 # The value is a SecureRandom Base64 string with the non-alphanumeric
 # Base64 characters ('+', '/' and the '=' padding) removed in a single
 # pass. Base64 output never contains a backslash, so no separate step
 # is needed for it.
 #
 # @return [String] random alphanumeric string
 def randstr
   SecureRandom.base64.delete('+/=')
 end

 # Generates TAG_COUNT distinct tag names and writes them to
 # data/tags.json (a JSON array, one quoted tag per line) and
 # data/tags.txt (one tag per line).
 #
 # Each tag is the first nine characters of a randstr value. The 'data'
 # directory must already exist.
 #
 # @return [Array<String>] the generated tags, in the order they were written
 def genTags
   seen = Set.new

   # Block-form File.open closes both files even if generation raises.
   File.open('data/tags.json', 'w') do |tags_json|
     File.open('data/tags.txt', 'w') do |tags_txt|
       tags_json.puts('[')

       tags = Array.new(TAG_COUNT) do |index|
         # Truncating makes collisions more likely than with full-length
         # strings, so keep drawing until the tag has not been seen before.
         tag = randstr[0..8]
         tag = randstr[0..8] until seen.add?(tag)

         tags_txt.puts(tag)
         # No trailing comma after the last element.
         tags_json.puts(index == TAG_COUNT - 1 ? "  \"#{tag}\"" : "  \"#{tag}\",")
         tag
       end

       tags_json.puts(']')
       tags
     end
   end
 end

 # Generates ARTICLE_COUNT random article titles and writes them to
 # data/article_names.json (a JSON array, one quoted title per line) and
 # data/article_names.txt (one title per line).
 #
 # Titles are raw randstr values; uniqueness is not enforced here.
 # The 'data' directory must already exist.
 #
 # @return [Array<String>] the generated titles, in the order they were written
 def gen_article_names
   # Block-form File.open closes both files even if generation raises.
   File.open('data/article_names.json', 'w') do |names_json|
     File.open('data/article_names.txt', 'w') do |names_txt|
       names_json.puts('[')

       titles = Array.new(ARTICLE_COUNT) do |index|
         title = randstr
         names_txt.puts(title)
         # No trailing comma after the last element.
         names_json.puts(index == ARTICLE_COUNT - 1 ? "  \"#{title}\"" : "  \"#{title}\",")
         title
       end

       names_json.puts(']')
       titles
     end
   end
 end

 # Picks `count` distinct random integers from the range 0...max_exclusively.
 #
 # @param count [Integer] how many distinct values to return
 # @param max_exclusively [Integer] exclusive upper bound of the values
 # @return [Array<Integer>] `count` distinct integers (empty when count <= 0)
 # @raise [ArgumentError] if count exceeds max_exclusively, because that many
 #   distinct values do not exist and the search could never finish
 def unique_randoms(count, max_exclusively)
   if count > max_exclusively
     raise ArgumentError, "cannot pick #{count} unique values below #{max_exclusively}"
   end

   picked = Set.new
   # Check the size before drawing so that count == 0 yields an empty result.
   picked.add(rand(max_exclusively)) while picked.size < count
   picked.to_a
 end

 # Assigns a random set of tags to every article and writes two files:
 #
 #   data/references_only_<maxtagsperpage>.json
 #     one "<article index>: [<tag indices>]" entry per line
 #   data/articles_with_tags_<maxtagsperpage>.json
 #     one {"title": ..., "tags": [...]} object per line
 #
 # NOTE(review): the references file wraps "index: [...]" entries in [ ],
 # which is not strict JSON. The layout is deliberately left unchanged here;
 # confirm with whatever reads the file before altering it.
 #
 # The 'data' directory must already exist.
 #
 # @param tag_names [Array<String>] all available tag names
 # @param article_names [Array<String>] article titles, one entry per article
 # @param maxtagsperpage [Integer] inclusive upper bound on the number of
 #   tags requested for a single article
 # @return [nil]
 def gen_references(tag_names, article_names, maxtagsperpage)
   # Never ask for more distinct tags than actually exist.
   tag_limit = [maxtagsperpage, tag_names.size].min
   last_index = article_names.size - 1

   # Block-form File.open closes both files even if generation raises.
   File.open("data/references_only_#{maxtagsperpage}.json", 'w') do |references|
     File.open("data/articles_with_tags_#{maxtagsperpage}.json", 'w') do |articles_with_tags|
       references.puts('[')
       articles_with_tags.puts('[')

       article_names.each_with_index do |article, article_index|
         tag_indices = unique_randoms(rand(0..tag_limit), tag_names.size)
         article_tags = tag_indices.map { |tag_index| tag_names[tag_index] }
         # No trailing comma after the last entry.
         separator = article_index == last_index ? '' : ','

         references.puts("  #{article_index}: #{JSON.generate(tag_indices)}#{separator}")
         # to_json quotes and escapes the title so unusual characters cannot
         # break the surrounding JSON object.
         articles_with_tags.puts(
           "  {\"title\": #{article.to_json}, \"tags\": #{JSON.generate(article_tags)}}#{separator}"
         )
       end

       references.puts(']')
       articles_with_tags.puts(']')
     end
   end
 end


 # Script driver: ensures the 'data' directory exists, generates the tag and
 # article-title lists, sanity-checks them, and then produces the reference
 # files for three different per-article tag limits.
 #
 # Raises a RuntimeError when a generated list has the wrong size or the
 # article titles contain duplicates.
 def main
   Dir.exist?('data') || Dir.mkdir('data')

   tag_names = genTags
   article_names = gen_article_names

   raise 'bad tag_names size' if tag_names.size != TAG_COUNT
   raise 'bad article_names size' if article_names.size != ARTICLE_COUNT
   raise 'article_names has duplicates' if article_names.uniq.size != article_names.size

   # Generate three different, independent data sets.
   outcomes = [20, 100, 1000].map do |max_tags|
     gen_references(tag_names, article_names, max_tags)
   end
   # The value of the final gen_references call is the method's result.
   outcomes.last
 end

 # Run the generator when the script is loaded.
 main
	#!/usr/bin/env ruby
	# encoding: utf-8

	require 'securerandom'
	require 'set'
	require 'json'

	# Number of distinct tag names that genTags generates.
	TAG_COUNT = 10_000
	# Number of article titles that gen_article_names generates.
	ARTICLE_COUNT = 1_000_000

	# Returns a random string containing only letters and digits.
	#
	# The value is a SecureRandom Base64 string with the non-alphanumeric
	# Base64 characters ('+', '/' and the '=' padding) removed in a single
	# pass. Base64 output never contains a backslash, so no separate step
	# is needed for it.
	#
	# @return [String] random alphanumeric string
	def randstr
	  SecureRandom.base64.delete('+/=')
	end

	# Generates TAG_COUNT distinct tag names and writes them to
	# data/tags.json (a JSON array, one quoted tag per line) and
	# data/tags.txt (one tag per line).
	#
	# Each tag is the first nine characters of a randstr value. The 'data'
	# directory must already exist.
	#
	# @return [Array<String>] the generated tags, in the order they were written
	def genTags
	  seen = Set.new

	  # Block-form File.open closes both files even if generation raises.
	  File.open('data/tags.json', 'w') do |tags_json|
	    File.open('data/tags.txt', 'w') do |tags_txt|
	      tags_json.puts('[')

	      tags = Array.new(TAG_COUNT) do |index|
	        # Truncating makes collisions more likely than with full-length
	        # strings, so keep drawing until the tag has not been seen before.
	        tag = randstr[0..8]
	        tag = randstr[0..8] until seen.add?(tag)

	        tags_txt.puts(tag)
	        # No trailing comma after the last element.
	        tags_json.puts(index == TAG_COUNT - 1 ? " \"#{tag}\"" : " \"#{tag}\",")
	        tag
	      end

	      tags_json.puts(']')
	      tags
	    end
	  end
	end

	# Generates ARTICLE_COUNT random article titles and writes them to
	# data/article_names.json (a JSON array, one quoted title per line) and
	# data/article_names.txt (one title per line).
	#
	# Titles are raw randstr values; uniqueness is not enforced here.
	# The 'data' directory must already exist.
	#
	# @return [Array<String>] the generated titles, in the order they were written
	def gen_article_names
	  # Block-form File.open closes both files even if generation raises.
	  File.open('data/article_names.json', 'w') do |names_json|
	    File.open('data/article_names.txt', 'w') do |names_txt|
	      names_json.puts('[')

	      titles = Array.new(ARTICLE_COUNT) do |index|
	        title = randstr
	        names_txt.puts(title)
	        # No trailing comma after the last element.
	        names_json.puts(index == ARTICLE_COUNT - 1 ? " \"#{title}\"" : " \"#{title}\",")
	        title
	      end

	      names_json.puts(']')
	      titles
	    end
	  end
	end

	# Picks `count` distinct random integers from the range 0...max_exclusively.
	#
	# @param count [Integer] how many distinct values to return
	# @param max_exclusively [Integer] exclusive upper bound of the values
	# @return [Array<Integer>] `count` distinct integers (empty when count <= 0)
	# @raise [ArgumentError] if count exceeds max_exclusively, because that many
	#   distinct values do not exist and the search could never finish
	def unique_randoms(count, max_exclusively)
	  if count > max_exclusively
	    raise ArgumentError, "cannot pick #{count} unique values below #{max_exclusively}"
	  end

	  picked = Set.new
	  # Check the size before drawing so that count == 0 yields an empty result.
	  picked.add(rand(max_exclusively)) while picked.size < count
	  picked.to_a
	end

	# Assigns a random set of tags to every article and writes two files:
	#
	#   data/references_only_<maxtagsperpage>.json
	#     one "<article index>: [<tag indices>]" entry per line
	#   data/articles_with_tags_<maxtagsperpage>.json
	#     one {"title": ..., "tags": [...]} object per line
	#
	# NOTE(review): the references file wraps "index: [...]" entries in [ ],
	# which is not strict JSON. The layout is deliberately left unchanged here;
	# confirm with whatever reads the file before altering it.
	#
	# The 'data' directory must already exist.
	#
	# @param tag_names [Array<String>] all available tag names
	# @param article_names [Array<String>] article titles, one entry per article
	# @param maxtagsperpage [Integer] inclusive upper bound on the number of
	#   tags requested for a single article
	# @return [nil]
	def gen_references(tag_names, article_names, maxtagsperpage)
	  # Never ask for more distinct tags than actually exist.
	  tag_limit = [maxtagsperpage, tag_names.size].min
	  last_index = article_names.size - 1

	  # Block-form File.open closes both files even if generation raises.
	  File.open("data/references_only_#{maxtagsperpage}.json", 'w') do |references|
	    File.open("data/articles_with_tags_#{maxtagsperpage}.json", 'w') do |articles_with_tags|
	      references.puts('[')
	      articles_with_tags.puts('[')

	      article_names.each_with_index do |article, article_index|
	        tag_indices = unique_randoms(rand(0..tag_limit), tag_names.size)
	        article_tags = tag_indices.map { |tag_index| tag_names[tag_index] }
	        # No trailing comma after the last entry.
	        separator = article_index == last_index ? '' : ','

	        references.puts(" #{article_index}: #{JSON.generate(tag_indices)}#{separator}")
	        # to_json quotes and escapes the title so unusual characters cannot
	        # break the surrounding JSON object.
	        articles_with_tags.puts(
	          " {\"title\": #{article.to_json}, \"tags\": #{JSON.generate(article_tags)}}#{separator}"
	        )
	      end

	      references.puts(']')
	      articles_with_tags.puts(']')
	    end
	  end
	end


	# Script driver: ensures the 'data' directory exists, generates the tag and
	# article-title lists, sanity-checks them, and then produces the reference
	# files for three different per-article tag limits.
	#
	# Raises a RuntimeError when a generated list has the wrong size or the
	# article titles contain duplicates.
	def main
	  Dir.exist?('data') || Dir.mkdir('data')

	  tag_names = genTags
	  article_names = gen_article_names

	  raise 'bad tag_names size' if tag_names.size != TAG_COUNT
	  raise 'bad article_names size' if article_names.size != ARTICLE_COUNT
	  raise 'article_names has duplicates' if article_names.uniq.size != article_names.size

	  # Generate three different, independent data sets.
	  outcomes = [20, 100, 1000].map do |max_tags|
	    gen_references(tag_names, article_names, max_tags)
	  end
	  # The value of the final gen_references call is the method's result.
	  outcomes.last
	end

	# Run the generator when the script is loaded.
	main