Skip to content

Instantly share code, notes, and snippets.

@joncham
Forked from kellabyte/indexing_test.cpp
Last active December 15, 2015 08:19
Show Gist options
  • Save joncham/5230481 to your computer and use it in GitHub Desktop.
Save joncham/5230481 to your computer and use it in GitHub Desktop.
/*
For some reason the memory consumed in ReadFile() isn't released
when getchar() in main() is hit. Why isn't the scope of the stack releasing it?
Also, I don't know why the output is wrong. Here is the expected and actual output.
EWAHBoolArray code I am using can be found here:
https://github.com/lemire/EWAHBoolArray
This code is indexing the year column.
INPUT
----------
"Country or Area","Year","Comm. Code","Commodity","Flow","Trade (USD)","Weight (kg)","Quantity Name","Quantity"
"Afghanistan","2011","TOTAL","ALL COMMODITIES","Import","6390310947","","No Quantity",""
"Afghanistan","2011","TOTAL","ALL COMMODITIES","Export","375850935","","No Quantity",""
"Afghanistan","2010","TOTAL","ALL COMMODITIES","Import","5154249867","","No Quantity",""
"Afghanistan","2010","TOTAL","ALL COMMODITIES","Export","388483635","","No Quantity",""
"Afghanistan","2009","TOTAL","ALL COMMODITIES","Import","3336434781","","No Quantity",""
"Afghanistan","2009","TOTAL","ALL COMMODITIES","Export","403441006","","No Quantity",""
"Afghanistan","2008","TOTAL","ALL COMMODITIES","Import","3019860129","","No Quantity",""
"Afghanistan","2008","TOTAL","ALL COMMODITIES","Export","540065594","","No Quantity",""
"Albania","2011","TOTAL","ALL COMMODITIES","Import","5395853069","","No Quantity",""
EXPECTED
----------
ROWS INDEXED: 9
TERMS: 4
ACTUAL
----------
ROWS INDEXED: 13 - WRONG - This should be 9 (see EXPECTED above).
TERMS: 4 - CORRECT
*/
#ifdef _MSC_VER
#define _SCL_SECURE_NO_WARNINGS
#endif
#include "stdafx.h"
#include <stdlib.h>
#include <iostream>
#include <vector>
#include <list>
#include <unordered_map>
#include <boost/cstdint.hpp>
#include <boost/timer/timer.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/discrete_distribution.hpp>
#include <boost/iostreams/device/mapped_file.hpp>
#include <boost/iostreams/stream.hpp>
#include <boost/unordered_map.hpp>
#include <EWAHBoolArray/headers/ewah.h>
#include<boost/tokenizer.hpp>
#include <fstream>
#include <string>
#include <algorithm>
#include <iterator>
using namespace std;
using namespace boost;
using namespace boost::timer;
using namespace boost::iostreams;
/**
 * Loads the first few records of the CSV file at c:\data.csv.
 *
 * The first line of the file (the column-name header) is consumed but NOT
 * stored. After it, at most nine logical records are parsed. A logical record
 * may span several physical lines when a quoted field contains line breaks;
 * such lines are joined with '\n' before tokenizing.
 *
 * Fields are split with boost::escaped_list_separator using '\\' as the
 * escape character, ',' as the delimiter and '"' as the quote character.
 *
 * @return A shared vector holding one vector<string> of fields per data
 *         record, or nullptr if the file could not be opened.
 */
std::shared_ptr<vector<vector<string>>> ReadFile()
{
    // One header line plus nine data records.
    const int kMaxRecords = 10;

    ifstream in("c:\\data.csv");
    if (!in.is_open())
    {
        return nullptr;
    }

    typedef boost::tokenizer<boost::escaped_list_separator<char>> Tokenizer;
    boost::escaped_list_separator<char> sep('\\', ',', '\"');

    auto rows = std::make_shared<vector<vector<string>>>();

    string line;                // Logical record accumulated so far.
    string buffer;              // Physical line most recently read.
    bool inside_quotes = false; // True while a quoted field is still open.
    int count = 0;              // Logical records consumed, header included.

    while (count < kMaxRecords && getline(in, buffer))
    {
        if (count > 0)
        {
            // Every '"' on the line toggles the quoted state; if the state is
            // still "open" at the end of the line, the record continues on
            // the next physical line.
            for (size_t quote = buffer.find('"');
                 quote != string::npos;
                 quote = buffer.find('"', quote + 1))
            {
                inside_quotes = !inside_quotes;
            }

            line.append(buffer);
            if (inside_quotes)
            {
                line.append("\n");
                continue; // Record incomplete: do not count it yet.
            }

            // The tokenizer refers to `line`, so build the row before
            // clearing it.
            Tokenizer tok(line, sep);
            rows->emplace_back(tok.begin(), tok.end());
            line.clear();
        }
        // count == 0 is the header line: it is skipped on purpose.
        count++;
    }

    // `in` is closed by its destructor.
    return rows;
}
/**
 * Builds a bitmap index over the second column of the given rows.
 *
 * For every distinct value found in column 1 (the year column in the sample
 * data) the returned map holds a compressed bitmap in which bit i is set when
 * row i carries that value. Every row in `rows` is indexed; rows with fewer
 * than two columns are ignored.
 *
 * @param rows Parsed rows; may be null (for example when the input file
 *             could not be opened), in which case an empty index is returned.
 * @return A shared map from column value to the bitmap of matching row
 *         numbers. Never null.
 */
std::shared_ptr<std::unordered_map<string, EWAHBoolArray<__int64>>> IndexData(std::shared_ptr<vector<vector<string>>> rows)
{
    typedef std::unordered_map<string, EWAHBoolArray<__int64>> IndexMap;

    // Zero-based position of the column whose values become index terms.
    const size_t kIndexedColumn = 1;

    auto indexMap = std::make_shared<IndexMap>();
    if (!rows)
    {
        return indexMap;
    }

    const size_t rowCount = rows->size();
    for (size_t i = 0; i < rowCount; ++i)
    {
        // Bind by reference: copying the whole row on each pass is wasteful.
        const vector<string> &row = (*rows)[i];
        if (row.size() <= kIndexedColumn)
        {
            continue;
        }

        // set(i) turns on bit i. The previous add(i) appended the number i
        // as a raw bitmap word, so numberOfOnes() reported the sum of the
        // popcounts of the row numbers instead of the number of rows.
        // set() requires increasing positions per bitmap, which holds
        // because i only grows.
        (*indexMap)[row[kIndexedColumn]].set(i);
    }
    return indexMap;
}
int _tmain(int argc, _TCHAR* argv[])
{
{
std::shared_ptr<vector<vector<string>>> rows;
{
cout << "READ ";
auto_cpu_timer timer;
rows = ReadFile();
}
{
std::shared_ptr<std::unordered_map<string, EWAHBoolArray<__int64>>> indexes;
{
cout << "INDEX ";
auto_cpu_timer timer;
indexes = IndexData(rows);
}
typedef std::unordered_map<string, EWAHBoolArray<__int64>> indexIterator;
__int64 rowCount = 0;
__int64 diskByteCount = 0;
__int64 compressedByteCount = 0;
for (indexIterator::iterator it = indexes->begin(); it != indexes->end(); ++it)
{
rowCount += it->second.numberOfOnes();
diskByteCount += it->second.sizeOnDisk();
compressedByteCount += it->second.computeStatistics().getCompressedSize();
}
cout << "ROWS INDEXED: " << rowCount << endl;
cout << "TERMS: " << indexes->size() << endl;
cout << "DISK BYTES: " << diskByteCount << endl;
cout << "COMPRESSED BYTES: " << compressedByteCount << endl;
}
getchar();
}
getchar();
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment