joncham · December 15, 2015 08:19
diff --git a/indexing_test.cpp b/indexing_test.cpp
 /*  
  For some reason the memory consumed in ReadFile() isn't released 
  when getchar() in main() is hit. Why isn't the scope of the stack releasing it?

  Also, I don't know why the output is wrong. Here is the expected and actual output.

  EWAHBoolArray code I am using can be found here:
  https://github.com/lemire/EWAHBoolArray
  
  This code is indexing the year column.

  INPUT
  ----------
  "Country or Area","Year","Comm. Code","Commodity","Flow","Trade (USD)","Weight (kg)","Quantity Name","Quantity"
  "Afghanistan","2011","TOTAL","ALL COMMODITIES","Import","6390310947","","No Quantity",""
  "Afghanistan","2011","TOTAL","ALL COMMODITIES","Export","375850935","","No Quantity",""
  "Afghanistan","2010","TOTAL","ALL COMMODITIES","Import","5154249867","","No Quantity",""
  "Afghanistan","2010","TOTAL","ALL COMMODITIES","Export","388483635","","No Quantity",""
  "Afghanistan","2009","TOTAL","ALL COMMODITIES","Import","3336434781","","No Quantity",""
  "Afghanistan","2009","TOTAL","ALL COMMODITIES","Export","403441006","","No Quantity",""
  "Afghanistan","2008","TOTAL","ALL COMMODITIES","Import","3019860129","","No Quantity",""
  "Afghanistan","2008","TOTAL","ALL COMMODITIES","Export","540065594","","No Quantity",""
  "Albania","2011","TOTAL","ALL COMMODITIES","Import","5395853069","","No Quantity",""

  EXPECTED
  ----------
  ROWS INDEXED: 9
  TERMS: 4

  ACTUAL
  ----------
  ROWS INDEXED: 13  - WRONG - This should be 9.
  TERMS: 4          - CORRECT
  */

 #ifdef _MSC_VER
 #define _SCL_SECURE_NO_WARNINGS
 #endif

 #include "stdafx.h"
 #include <stdlib.h>
 #include <iostream>
 #include <vector>
 #include <list>
 #include <unordered_map>
 #include <boost/cstdint.hpp>
 #include <boost/timer/timer.hpp>
 #include <boost/random/mersenne_twister.hpp>
 #include <boost/random/discrete_distribution.hpp>
 #include <boost/iostreams/device/mapped_file.hpp>
 #include <boost/iostreams/stream.hpp>
 #include <boost/unordered_map.hpp>

 #include <EWAHBoolArray/headers/ewah.h>

 #include<boost/tokenizer.hpp>
 #include <fstream>
 #include <string>
 #include <algorithm>
 #include <iterator>

 using namespace std;
 using namespace boost;
 using namespace boost::timer;
 using namespace boost::iostreams;

 // Reads c:\data.csv and returns its first data records as tokenized rows.
 //
 // The first physical line (the CSV header) is skipped and never stored.
 // At most nine records after it are parsed; a record whose quoted field
 // contains a line break is stitched back together from several physical
 // lines before it is tokenized.
 //
 // Returns nullptr when the file cannot be opened; otherwise a (possibly
 // empty) vector holding one vector<string> of fields per record.
 std::shared_ptr<vector<vector<string>>> ReadFile()
 {
 	// One skipped header line plus nine data records.
 	const int maxRecords = 10;

 	ifstream in("c:\\data.csv");
 	if (!in.is_open())
 	{
 		return nullptr;
 	}

 	typedef boost::tokenizer<boost::escaped_list_separator<char>> Tokenizer;
 	boost::escaped_list_separator<char> sep('\\', ',', '\"');

 	auto rows = std::make_shared<vector<vector<string>>>();

 	string line;                 // logical record accumulated so far
 	string buffer;               // current physical line
 	bool inside_quotes = false;  // true while a quoted field is still open
 	int count = 0;               // header + completed records seen so far

 	while (count < maxRecords && getline(in, buffer))
 	{
 		if (count > 0)
 		{
 			// Every quote character toggles the "inside a quoted field" state,
 			// which carries over to the next physical line.
 			for (size_t quote = buffer.find('"');
 				 quote != string::npos;
 				 quote = buffer.find('"', quote + 1))
 			{
 				inside_quotes = !inside_quotes;
 			}

 			line.append(buffer);

 			if (inside_quotes)
 			{
 				// The record continues on the next physical line; restore the
 				// line break that getline() consumed and keep reading.
 				// count is deliberately not advanced for a partial record.
 				line.append("\n");
 				continue;
 			}

 			// Build the row before clearing 'line': the tokenizer iterates
 			// over the string it was constructed from.
 			Tokenizer tok(line, sep);
 			rows->push_back(vector<string>(tok.begin(), tok.end()));
 			line.clear();
 		}
 		count++;
 	}

 	// The ifstream closes itself when it goes out of scope.
 	return rows;
 }

 // Builds one compressed bitmap per distinct value found in the second
 // column of 'rows'. In the bitmap stored under a value, bit i is set when
 // rows->at(i) holds that value in its second column.
 //
 // rows: tokenized records; may be null, in which case an empty map is
 //       returned. Rows with fewer than two fields are ignored.
 // Returns a map from column value to its bitmap.
 //
 // NOTE(review): EWAHBoolArray is normally instantiated with an unsigned
 // word type; the signed __int64 is kept because it is part of this
 // function's return type - confirm it behaves as intended.
 std::shared_ptr<std::unordered_map<string, EWAHBoolArray<__int64>>> IndexData(std::shared_ptr<vector<vector<string>>> rows)
 {
 	auto indexMap = std::make_shared<std::unordered_map<string, EWAHBoolArray<__int64>>>();
 	if (!rows)
 	{
 		return indexMap;
 	}

 	// Only the second column is indexed (for now).
 	const size_t indexedColumn = 1;
 	const size_t rowCount = rows->size();

 	for (size_t i = 0; i < rowCount; ++i)
 	{
 		const vector<string> &row = (*rows)[i];
 		if (row.size() <= indexedColumn)
 		{
 			continue;
 		}

 		// set(i) marks bit i. The previous add(i) appended the number i as a
 		// raw word, so numberOfOnes() summed the popcounts of the row numbers
 		// instead of counting rows. EWAH requires bits to be set in
 		// increasing order, which this ascending loop guarantees.
 		(*indexMap)[row[indexedColumn]].set(i);
 	}
 	return indexMap;
 }

 int _tmain(int argc, _TCHAR* argv[])
 {
 	{
 		std::shared_ptr<vector<vector<string>>> rows;
 		{
 			cout << "READ ";
 			auto_cpu_timer timer;
 			rows = ReadFile();
 		}
 		
 		{
 	
 			std::shared_ptr<std::unordered_map<string, EWAHBoolArray<__int64>>> indexes;
 			{
 				cout << "INDEX ";
 				auto_cpu_timer timer;
 				indexes = IndexData(rows);
 			}
 		
 			typedef std::unordered_map<string, EWAHBoolArray<__int64>> indexIterator; 
 		
 			__int64 rowCount = 0;
 			__int64 diskByteCount = 0;
 			__int64 compressedByteCount = 0;
 		
 			for (indexIterator::iterator it = indexes->begin(); it != indexes->end(); ++it) 
 			{
 				rowCount += it->second.numberOfOnes();
 				diskByteCount += it->second.sizeOnDisk();
 				compressedByteCount += it->second.computeStatistics().getCompressedSize();
 			}
 		
 			cout << "ROWS INDEXED: " << rowCount << endl;
 			cout << "TERMS: " << indexes->size() << endl;
 			cout << "DISK BYTES: " << diskByteCount << endl;
 			cout << "COMPRESSED BYTES: " << compressedByteCount << endl;
 		
 		}
 		
 		getchar();
 	}

 	getchar();
 	return 0;
 }
	/*
	For some reason the memory consumed in ReadFile() isn't released
	when getchar() in main() is hit. Why isn't the scope of the stack releasing it?

	Also, I don't know why the output is wrong. Here is the expected and actual output.

	EWAHBoolArray code I am using can be found here:
	https://github.com/lemire/EWAHBoolArray

	This code is indexing the year column.

	INPUT
	----------
	"Country or Area","Year","Comm. Code","Commodity","Flow","Trade (USD)","Weight (kg)","Quantity Name","Quantity"
	"Afghanistan","2011","TOTAL","ALL COMMODITIES","Import","6390310947","","No Quantity",""
	"Afghanistan","2011","TOTAL","ALL COMMODITIES","Export","375850935","","No Quantity",""
	"Afghanistan","2010","TOTAL","ALL COMMODITIES","Import","5154249867","","No Quantity",""
	"Afghanistan","2010","TOTAL","ALL COMMODITIES","Export","388483635","","No Quantity",""
	"Afghanistan","2009","TOTAL","ALL COMMODITIES","Import","3336434781","","No Quantity",""
	"Afghanistan","2009","TOTAL","ALL COMMODITIES","Export","403441006","","No Quantity",""
	"Afghanistan","2008","TOTAL","ALL COMMODITIES","Import","3019860129","","No Quantity",""
	"Afghanistan","2008","TOTAL","ALL COMMODITIES","Export","540065594","","No Quantity",""
	"Albania","2011","TOTAL","ALL COMMODITIES","Import","5395853069","","No Quantity",""

	EXPECTED
	----------
	ROWS INDEXED: 9
	TERMS: 4

	ACTUAL
	----------
	ROWS INDEXED: 13 - WRONG - This should be 9.
	TERMS: 4 - CORRECT
	*/

	#ifdef _MSC_VER
	#define _SCL_SECURE_NO_WARNINGS
	#endif

	#include "stdafx.h"
	#include <stdlib.h>
	#include <iostream>
	#include <vector>
	#include <list>
	#include <unordered_map>
	#include <boost/cstdint.hpp>
	#include <boost/timer/timer.hpp>
	#include <boost/random/mersenne_twister.hpp>
	#include <boost/random/discrete_distribution.hpp>
	#include <boost/iostreams/device/mapped_file.hpp>
	#include <boost/iostreams/stream.hpp>
	#include <boost/unordered_map.hpp>

	#include <EWAHBoolArray/headers/ewah.h>

	#include<boost/tokenizer.hpp>
	#include <fstream>
	#include <string>
	#include <algorithm>
	#include <iterator>

	using namespace std;
	using namespace boost;
	using namespace boost::timer;
	using namespace boost::iostreams;

	// Reads c:\data.csv and returns its first data records as tokenized rows.
	//
	// The first physical line (the CSV header) is skipped and never stored.
	// At most nine records after it are parsed; a record whose quoted field
	// contains a line break is stitched back together from several physical
	// lines before it is tokenized.
	//
	// Returns nullptr when the file cannot be opened; otherwise a (possibly
	// empty) vector holding one vector<string> of fields per record.
	std::shared_ptr<vector<vector<string>>> ReadFile()
	{
		// One skipped header line plus nine data records.
		const int maxRecords = 10;

		ifstream in("c:\\data.csv");
		if (!in.is_open())
		{
			return nullptr;
		}

		typedef boost::tokenizer<boost::escaped_list_separator<char>> Tokenizer;
		boost::escaped_list_separator<char> sep('\\', ',', '\"');

		auto rows = std::make_shared<vector<vector<string>>>();

		string line;                 // logical record accumulated so far
		string buffer;               // current physical line
		bool inside_quotes = false;  // true while a quoted field is still open
		int count = 0;               // header + completed records seen so far

		while (count < maxRecords && getline(in, buffer))
		{
			if (count > 0)
			{
				// Every quote character toggles the "inside a quoted field" state,
				// which carries over to the next physical line.
				for (size_t quote = buffer.find('"');
					 quote != string::npos;
					 quote = buffer.find('"', quote + 1))
				{
					inside_quotes = !inside_quotes;
				}

				line.append(buffer);

				if (inside_quotes)
				{
					// The record continues on the next physical line; restore the
					// line break that getline() consumed and keep reading.
					// count is deliberately not advanced for a partial record.
					line.append("\n");
					continue;
				}

				// Build the row before clearing 'line': the tokenizer iterates
				// over the string it was constructed from.
				Tokenizer tok(line, sep);
				rows->push_back(vector<string>(tok.begin(), tok.end()));
				line.clear();
			}
			count++;
		}

		// The ifstream closes itself when it goes out of scope.
		return rows;
	}

	// Builds one compressed bitmap per distinct value found in the second
	// column of 'rows'. In the bitmap stored under a value, bit i is set when
	// rows->at(i) holds that value in its second column.
	//
	// rows: tokenized records; may be null, in which case an empty map is
	//       returned. Rows with fewer than two fields are ignored.
	// Returns a map from column value to its bitmap.
	//
	// NOTE(review): EWAHBoolArray is normally instantiated with an unsigned
	// word type; the signed __int64 is kept because it is part of this
	// function's return type - confirm it behaves as intended.
	std::shared_ptr<std::unordered_map<string, EWAHBoolArray<__int64>>> IndexData(std::shared_ptr<vector<vector<string>>> rows)
	{
		auto indexMap = std::make_shared<std::unordered_map<string, EWAHBoolArray<__int64>>>();
		if (!rows)
		{
			return indexMap;
		}

		// Only the second column is indexed (for now).
		const size_t indexedColumn = 1;
		const size_t rowCount = rows->size();

		for (size_t i = 0; i < rowCount; ++i)
		{
			const vector<string> &row = (*rows)[i];
			if (row.size() <= indexedColumn)
			{
				continue;
			}

			// set(i) marks bit i. The previous add(i) appended the number i as a
			// raw word, so numberOfOnes() summed the popcounts of the row numbers
			// instead of counting rows. EWAH requires bits to be set in
			// increasing order, which this ascending loop guarantees.
			(*indexMap)[row[indexedColumn]].set(i);
		}
		return indexMap;
	}

	int _tmain(int argc, _TCHAR* argv[])
	{
	{
	std::shared_ptr<vector<vector<string>>> rows;
	{
	cout << "READ ";
	auto_cpu_timer timer;
	rows = ReadFile();
	}

	{

	std::shared_ptr<std::unordered_map<string, EWAHBoolArray<__int64>>> indexes;
	{
	cout << "INDEX ";
	auto_cpu_timer timer;
	indexes = IndexData(rows);
	}

	typedef std::unordered_map<string, EWAHBoolArray<__int64>> indexIterator;

	__int64 rowCount = 0;
	__int64 diskByteCount = 0;
	__int64 compressedByteCount = 0;

	for (indexIterator::iterator it = indexes->begin(); it != indexes->end(); ++it)
	{
	rowCount += it->second.numberOfOnes();
	diskByteCount += it->second.sizeOnDisk();
	compressedByteCount += it->second.computeStatistics().getCompressedSize();
	}

	cout << "ROWS INDEXED: " << rowCount << endl;
	cout << "TERMS: " << indexes->size() << endl;
	cout << "DISK BYTES: " << diskByteCount << endl;
	cout << "COMPRESSED BYTES: " << compressedByteCount << endl;

	}

	getchar();
	}

	getchar();
	return 0;
	}