kellabyte · December 15, 2015 08:19 · bitcrazed · Mar 24, 2013 · lemire · Mar 27, 2013
diff --git a/indexing_test.cpp b/indexing_test.cpp
 /* SAMPLE CSV FILE */
 /*
 "Country or Area","Year","Comm. Code","Commodity","Flow","Trade (USD)","Weight (kg)","Quantity Name","Quantity"
 "Afghanistan","2011","TOTAL","ALL COMMODITIES","Import","6390310947","","No Quantity",""
 "Afghanistan","2011","TOTAL","ALL COMMODITIES","Export","375850935","","No Quantity",""
 "Afghanistan","2010","TOTAL","ALL COMMODITIES","Import","5154249867","","No Quantity",""
 "Afghanistan","2010","TOTAL","ALL COMMODITIES","Export","388483635","","No Quantity",""
 "Afghanistan","2009","TOTAL","ALL COMMODITIES","Import","3336434781","","No Quantity",""
 "Afghanistan","2009","TOTAL","ALL COMMODITIES","Export","403441006","","No Quantity",""
 "Afghanistan","2008","TOTAL","ALL COMMODITIES","Import","3019860129","","No Quantity",""
 "Afghanistan","2008","TOTAL","ALL COMMODITIES","Export","540065594","","No Quantity",""
 "Albania","2011","TOTAL","ALL COMMODITIES","Import","5395853069","","No Quantity",""
 "Albania","2011","TOTAL","ALL COMMODITIES","Export","1948207305","","No Quantity",""
 "Albania","2010","TOTAL","ALL COMMODITIES","Import","4602774967","","No Quantity",""
 "Albania","2010","TOTAL","ALL COMMODITIES","Export","1549955724","","No Quantity",""
 "Albania","2010","TOTAL","ALL COMMODITIES","Re-Import","26393","","No Quantity",""
 "Albania","2009","TOTAL","ALL COMMODITIES","Import","4548287875","","No Quantity",""
 "Albania","2009","TOTAL","ALL COMMODITIES","Export","1087914902","","No Quantity",""
 "Albania","2009","TOTAL","ALL COMMODITIES","Re-Import","272403","","No Quantity",""
 "Albania","2008","TOTAL","ALL COMMODITIES","Import","5250490022","","No Quantity",""
 "Albania","2008","TOTAL","ALL COMMODITIES","Export","1354921653","","No Quantity",""
 "Albania","2008","TOTAL","ALL COMMODITIES","Re-Export","810868093","","No Quantity",""
 "Albania","2008","TOTAL","ALL COMMODITIES","Re-Import","509068","","No Quantity",""
 "Albania","2007","TOTAL","ALL COMMODITIES","Import","4200864046","","No Quantity",""
 "Albania","2007","TOTAL","ALL COMMODITIES","Export","1077690359","","No Quantity",""
 "Albania","2007","TOTAL","ALL COMMODITIES","Re-Import","4494753","","No Quantity",""
 */

 #ifdef _MSC_VER
 #define _SCL_SECURE_NO_WARNINGS
 #endif

 #include "stdafx.h"

 #include <stdlib.h>

 #include <algorithm>    // copy
 #include <fstream>      // fstream
 #include <iostream>
 #include <iterator>     // ostream_operator
 #include <list>
 #include <memory>       // shared_ptr, make_shared
 #include <string>
 #include <unordered_map>
 #include <vector>

 #include <boost/cstdint.hpp>
 #include <boost/iostreams/device/mapped_file.hpp>
 #include <boost/iostreams/stream.hpp>
 #include <boost/random/discrete_distribution.hpp>
 #include <boost/random/mersenne_twister.hpp>
 #include <boost/timer/timer.hpp>
 #include<boost/tokenizer.hpp>
 #include <boost/unordered_map.hpp>

 #include <strtk\strtk.hpp>
 #include <EWAHBoolArray/headers/ewah.h>

 #include "MurmurHash3.h"

 using namespace std;
 using namespace boost;
 using namespace boost::timer;
 using namespace boost::random;
 using namespace boost::iostreams;

 int const rowCount = 1000;
 int const indexCount = 100;

 /// Loads the first 24 data records of the CSV file at c:\data.csv.
 ///
 /// The first physical line (the CSV header) is read but not stored. A record
 /// whose quoted field contains line breaks is stitched back together from its
 /// physical lines before it is tokenized, using ',' as the separator, '"' as
 /// the quote character and '\' as the escape character.
 ///
 /// When the input ends before the line limit is reached, the total unused
 /// string capacity of the stored fields is printed to stdout.
 ///
 /// @return the parsed records (one vector of fields per record), or nullptr
 ///         when the file cannot be opened.
 std::shared_ptr<vector<vector<string>>> ReadFile()
 {
    // One CSV header line plus 24 data records.
    const int lineLimit = 25;

    // The stream closes itself when it goes out of scope, on every return path.
    ifstream in("c:\\data.csv");
    if (!in.is_open())
    {
        return nullptr;
    }

    typedef boost::tokenizer<boost::escaped_list_separator<char>> Tokenizer;
    boost::escaped_list_separator<char> sep('\\', ',', '\"');

    auto rows = std::make_shared<vector<vector<string>>>();

    string line;                 // Logical record being assembled.
    string buffer;               // Physical line most recently read.
    bool inside_quotes = false;  // True while a quoted field is still open.
    int count = 0;               // Header line plus completed records so far.

    while (getline(in, buffer))
    {
        if (count == lineLimit)
        {
            return rows;
        }

        // count == 0 is the CSV header: it is consumed but not stored.
        if (count > 0)
        {
            // Every quote character flips the quoted state, so an odd number
            // of quotes seen so far means the record continues on the next
            // physical line.
            for (size_t quote = buffer.find('"');
                 quote != string::npos;
                 quote = buffer.find('"', quote + 1))
            {
                inside_quotes = !inside_quotes;
            }

            line.append(buffer);

            if (inside_quotes)
            {
                // Restore the line break that getline stripped and keep
                // reading; the record is not complete, so count is unchanged.
                line.append("\n");
                continue;
            }

            // The tokenizer reads from `line`, so the fields must be copied
            // out before the line is cleared.
            Tokenizer tok(line, sep);
            rows->push_back(vector<string>(tok.begin(), tok.end()));
            line.clear();
        }
        count++;
    }

    size_t wastedMemory = 0;
    for (const auto &row : *rows)
    {
        for (const auto &field : row)
        {
            wastedMemory += field.capacity() - field.size();
        }
    }
    cout << "Wasted memory" << wastedMemory << endl;

    return rows;
 }

 /// Builds a bitmap index over the second column of the given rows.
 ///
 /// Every distinct value found in column 1 becomes a key of the returned map;
 /// the bitmap stored under that key has bit i set for each row i (0-based
 /// position within @p rows) whose second column holds that value. Rows with
 /// fewer than two columns are skipped.
 ///
 /// @param rows parsed rows to index; a null pointer yields an empty index.
 /// @return map from column value to the bitmap of matching row positions.
 std::shared_ptr<std::unordered_map<string, EWAHBoolArray<uint32_t>>> IndexData(std::shared_ptr<vector<vector<string>>> rows)
 {
    typedef std::unordered_map<string, EWAHBoolArray<uint32_t>> IndexMap;

    // Only the 2nd column is indexed (for now).
    const size_t indexedColumn = 1;

    auto indexMap = std::make_shared<IndexMap>();
    if (!rows)
    {
        return indexMap;
    }

    const size_t totalRows = rows->size();
    for (size_t i = 0; i < totalRows; ++i)
    {
        // Bind by reference: copying the row would duplicate every field.
        const vector<string> &row = (*rows)[i];
        if (row.size() <= indexedColumn)
        {
            continue;
        }

        // operator[] creates an empty bitmap the first time a value is seen.
        // NOTE(review): rows are visited in ascending order, which presumably
        // matches the bit order EWAHBoolArray::set expects - confirm against
        // the library documentation.
        (*indexMap)[row[indexedColumn]].set(i);
    }
    return indexMap;
 }

 int _tmain(int argc, _TCHAR* argv[])
 {
    std::shared_ptr<vector<vector<string>>> rows;
    {
        cout << "READ ";
        auto_cpu_timer timer;
        rows = ReadFile();
    }

    {
        std::shared_ptr<std::unordered_map<string, EWAHBoolArray<uint32_t>>> indexes;
        {
            cout << "INDEX ";
            auto_cpu_timer timer;
            indexes = IndexData(rows);
        }

        typedef std::unordered_map<string, EWAHBoolArray<uint32_t>> indexIterator; 

        __int64 capacityCount = 0;
        __int64 rowCount = 0;
        __int64 actualRowCount = 0;
        __int64 diskByteCount = 0;
        __int64 compressedByteCount = 0;

        for(const auto it: *indexes)
        {
            for (EWAHBoolArray<uint32_t>::const_iterator it2 = it.second.begin(); it2 != it.second.end(); ++it2)
            {
                actualRowCount++;
            }
            rowCount += it.second.numberOfOnes();
            diskByteCount += it.second.sizeOnDisk();
            compressedByteCount += it.second.computeStatistics().getCompressedSize();
        }

        cout << "TOTAL CAPACITY: " << rows->capacity() << endl;         // 28
        cout << "TOTAL ROWS: " << rows->size() << endl;                 // 24
        cout << "ACTUAL ROWS: " << actualRowCount << endl;              // 26 ?? Why is this wrong?
        cout << "INDEXED ROWS: " << rowCount << endl;                   // 24
        cout << "TERMS: " << indexes->size() << endl;                   // 6
        cout << "DISK BYTES: " << diskByteCount << endl;                // 144
        cout << "COMPRESSED BYTES: " << compressedByteCount << endl;    // 12
    }

    getchar();
    return 0;
 }
	/* SAMPLE CSV FILE */
	/*
	"Country or Area","Year","Comm. Code","Commodity","Flow","Trade (USD)","Weight (kg)","Quantity Name","Quantity"
	"Afghanistan","2011","TOTAL","ALL COMMODITIES","Import","6390310947","","No Quantity",""
	"Afghanistan","2011","TOTAL","ALL COMMODITIES","Export","375850935","","No Quantity",""
	"Afghanistan","2010","TOTAL","ALL COMMODITIES","Import","5154249867","","No Quantity",""
	"Afghanistan","2010","TOTAL","ALL COMMODITIES","Export","388483635","","No Quantity",""
	"Afghanistan","2009","TOTAL","ALL COMMODITIES","Import","3336434781","","No Quantity",""
	"Afghanistan","2009","TOTAL","ALL COMMODITIES","Export","403441006","","No Quantity",""
	"Afghanistan","2008","TOTAL","ALL COMMODITIES","Import","3019860129","","No Quantity",""
	"Afghanistan","2008","TOTAL","ALL COMMODITIES","Export","540065594","","No Quantity",""
	"Albania","2011","TOTAL","ALL COMMODITIES","Import","5395853069","","No Quantity",""
	"Albania","2011","TOTAL","ALL COMMODITIES","Export","1948207305","","No Quantity",""
	"Albania","2010","TOTAL","ALL COMMODITIES","Import","4602774967","","No Quantity",""
	"Albania","2010","TOTAL","ALL COMMODITIES","Export","1549955724","","No Quantity",""
	"Albania","2010","TOTAL","ALL COMMODITIES","Re-Import","26393","","No Quantity",""
	"Albania","2009","TOTAL","ALL COMMODITIES","Import","4548287875","","No Quantity",""
	"Albania","2009","TOTAL","ALL COMMODITIES","Export","1087914902","","No Quantity",""
	"Albania","2009","TOTAL","ALL COMMODITIES","Re-Import","272403","","No Quantity",""
	"Albania","2008","TOTAL","ALL COMMODITIES","Import","5250490022","","No Quantity",""
	"Albania","2008","TOTAL","ALL COMMODITIES","Export","1354921653","","No Quantity",""
	"Albania","2008","TOTAL","ALL COMMODITIES","Re-Export","810868093","","No Quantity",""
	"Albania","2008","TOTAL","ALL COMMODITIES","Re-Import","509068","","No Quantity",""
	"Albania","2007","TOTAL","ALL COMMODITIES","Import","4200864046","","No Quantity",""
	"Albania","2007","TOTAL","ALL COMMODITIES","Export","1077690359","","No Quantity",""
	"Albania","2007","TOTAL","ALL COMMODITIES","Re-Import","4494753","","No Quantity",""
	*/

	#ifdef _MSC_VER
	#define _SCL_SECURE_NO_WARNINGS
	#endif

	#include "stdafx.h"
	#include <stdlib.h>
	#include <iostream>
	#include <vector>
	#include <list>
	#include <unordered_map>
	#include <boost/cstdint.hpp>
	#include <boost/timer/timer.hpp>
	#include <boost/random/mersenne_twister.hpp>
	#include <boost/random/discrete_distribution.hpp>
	#include <boost/iostreams/device/mapped_file.hpp>
	#include <boost/iostreams/stream.hpp>
	#include <boost/unordered_map.hpp>
	#include <strtk\strtk.hpp>

	#include <EWAHBoolArray/headers/ewah.h>

	#include<boost/tokenizer.hpp>
	#include <fstream> // fstream
	#include <string>
	#include <algorithm> // copy
	#include <iterator> // ostream_operator

	#include "MurmurHash3.h"

	using namespace std;
	using namespace boost;
	using namespace boost::timer;
	using namespace boost::random;
	using namespace boost::iostreams;

	int const rowCount = 1000;
	int const indexCount = 100;

	/// Loads the first 24 data records of the CSV file at c:\data.csv.
	///
	/// The first physical line (the CSV header) is read but not stored. A record
	/// whose quoted field contains line breaks is stitched back together from its
	/// physical lines before it is tokenized, using ',' as the separator, '"' as
	/// the quote character and '\' as the escape character.
	///
	/// When the input ends before the line limit is reached, the total unused
	/// string capacity of the stored fields is printed to stdout.
	///
	/// @return the parsed records (one vector of fields per record), or nullptr
	///         when the file cannot be opened.
	std::shared_ptr<vector<vector<string>>> ReadFile()
	{
	    // One CSV header line plus 24 data records.
	    const int lineLimit = 25;

	    // The stream closes itself when it goes out of scope, on every return path.
	    ifstream in("c:\\data.csv");
	    if (!in.is_open())
	    {
	        return nullptr;
	    }

	    typedef boost::tokenizer<boost::escaped_list_separator<char>> Tokenizer;
	    boost::escaped_list_separator<char> sep('\\', ',', '\"');

	    auto rows = std::make_shared<vector<vector<string>>>();

	    string line;                 // Logical record being assembled.
	    string buffer;               // Physical line most recently read.
	    bool inside_quotes = false;  // True while a quoted field is still open.
	    int count = 0;               // Header line plus completed records so far.

	    while (getline(in, buffer))
	    {
	        if (count == lineLimit)
	        {
	            return rows;
	        }

	        // count == 0 is the CSV header: it is consumed but not stored.
	        if (count > 0)
	        {
	            // Every quote character flips the quoted state, so an odd number
	            // of quotes seen so far means the record continues on the next
	            // physical line.
	            for (size_t quote = buffer.find('"');
	                 quote != string::npos;
	                 quote = buffer.find('"', quote + 1))
	            {
	                inside_quotes = !inside_quotes;
	            }

	            line.append(buffer);

	            if (inside_quotes)
	            {
	                // Restore the line break that getline stripped and keep
	                // reading; the record is not complete, so count is unchanged.
	                line.append("\n");
	                continue;
	            }

	            // The tokenizer reads from `line`, so the fields must be copied
	            // out before the line is cleared.
	            Tokenizer tok(line, sep);
	            rows->push_back(vector<string>(tok.begin(), tok.end()));
	            line.clear();
	        }
	        count++;
	    }

	    size_t wastedMemory = 0;
	    for (const auto &row : *rows)
	    {
	        for (const auto &field : row)
	        {
	            wastedMemory += field.capacity() - field.size();
	        }
	    }
	    cout << "Wasted memory" << wastedMemory << endl;

	    return rows;
	}

	/// Builds a bitmap index over the second column of the given rows.
	///
	/// Every distinct value found in column 1 becomes a key of the returned map;
	/// the bitmap stored under that key has bit i set for each row i (0-based
	/// position within @p rows) whose second column holds that value. Rows with
	/// fewer than two columns are skipped.
	///
	/// @param rows parsed rows to index; a null pointer yields an empty index.
	/// @return map from column value to the bitmap of matching row positions.
	std::shared_ptr<std::unordered_map<string, EWAHBoolArray<uint32_t>>> IndexData(std::shared_ptr<vector<vector<string>>> rows)
	{
	    typedef std::unordered_map<string, EWAHBoolArray<uint32_t>> IndexMap;

	    // Only the 2nd column is indexed (for now).
	    const size_t indexedColumn = 1;

	    auto indexMap = std::make_shared<IndexMap>();
	    if (!rows)
	    {
	        return indexMap;
	    }

	    const size_t totalRows = rows->size();
	    for (size_t i = 0; i < totalRows; ++i)
	    {
	        // Bind by reference: copying the row would duplicate every field.
	        const vector<string> &row = (*rows)[i];
	        if (row.size() <= indexedColumn)
	        {
	            continue;
	        }

	        // operator[] creates an empty bitmap the first time a value is seen.
	        // NOTE(review): rows are visited in ascending order, which presumably
	        // matches the bit order EWAHBoolArray::set expects - confirm against
	        // the library documentation.
	        (*indexMap)[row[indexedColumn]].set(i);
	    }
	    return indexMap;
	}

	int _tmain(int argc, _TCHAR* argv[])
	{
	std::shared_ptr<vector<vector<string>>> rows;
	{
	cout << "READ ";
	auto_cpu_timer timer;
	rows = ReadFile();
	}

	{
	std::shared_ptr<std::unordered_map<string, EWAHBoolArray<uint32_t>>> indexes;
	{
	cout << "INDEX ";
	auto_cpu_timer timer;
	indexes = IndexData(rows);
	}

	typedef std::unordered_map<string, EWAHBoolArray<uint32_t>> indexIterator;

	__int64 capacityCount = 0;
	__int64 rowCount = 0;
	__int64 actualRowCount = 0;
	__int64 diskByteCount = 0;
	__int64 compressedByteCount = 0;

	for(const auto it: *indexes)
	{
	for (EWAHBoolArray<uint32_t>::const_iterator it2 = it.second.begin(); it2 != it.second.end(); ++it2)
	{
	actualRowCount++;
	}
	rowCount += it.second.numberOfOnes();
	diskByteCount += it.second.sizeOnDisk();
	compressedByteCount += it.second.computeStatistics().getCompressedSize();
	}

	cout << "TOTAL CAPACITY: " << rows->capacity() << endl; // 28
	cout << "TOTAL ROWS: " << rows->size() << endl; // 24
	cout << "ACTUAL ROWS: " << actualRowCount << endl; // 26 ?? Why is this wrong?
	cout << "INDEXED ROWS: " << rowCount << endl; // 24
	cout << "TERMS: " << indexes->size() << endl; // 6
	cout << "DISK BYTES: " << diskByteCount << endl; // 144
	cout << "COMPRESSED BYTES: " << compressedByteCount << endl; // 12
	}

	getchar();
	return 0;
	}