-
-
Save joncham/5230481 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
For some reason the memory consumed in ReadFile() isn't released | |
when getchar() in main() is hit. Why isn't the scope of the stack releasing it? | |
Also, I don't know why the output is wrong. Here is the expected and actual output. | |
EWAHBoolArray code I am using can be found here: | |
https://github.com/lemire/EWAHBoolArray | |
This code is indexing the year column. | |
INPUT | |
---------- | |
"Country or Area","Year","Comm. Code","Commodity","Flow","Trade (USD)","Weight (kg)","Quantity Name","Quantity" | |
"Afghanistan","2011","TOTAL","ALL COMMODITIES","Import","6390310947","","No Quantity","" | |
"Afghanistan","2011","TOTAL","ALL COMMODITIES","Export","375850935","","No Quantity","" | |
"Afghanistan","2010","TOTAL","ALL COMMODITIES","Import","5154249867","","No Quantity","" | |
"Afghanistan","2010","TOTAL","ALL COMMODITIES","Export","388483635","","No Quantity","" | |
"Afghanistan","2009","TOTAL","ALL COMMODITIES","Import","3336434781","","No Quantity","" | |
"Afghanistan","2009","TOTAL","ALL COMMODITIES","Export","403441006","","No Quantity","" | |
"Afghanistan","2008","TOTAL","ALL COMMODITIES","Import","3019860129","","No Quantity","" | |
"Afghanistan","2008","TOTAL","ALL COMMODITIES","Export","540065594","","No Quantity","" | |
"Albania","2011","TOTAL","ALL COMMODITIES","Import","5395853069","","No Quantity","" | |
EXPECTED | |
---------- | |
ROWS INDEXED: 9 | |
TERMS: 4 | |
ACTUAL | |
---------- | |
ROWS INDEXED: 13 - WRONG - This should be 9.
TERMS: 4 - CORRECT | |
*/ | |
#ifdef _MSC_VER | |
#define _SCL_SECURE_NO_WARNINGS | |
#endif | |
#include "stdafx.h" | |
#include <stdlib.h> | |
#include <iostream> | |
#include <vector> | |
#include <list> | |
#include <unordered_map> | |
#include <boost/cstdint.hpp> | |
#include <boost/timer/timer.hpp> | |
#include <boost/random/mersenne_twister.hpp> | |
#include <boost/random/discrete_distribution.hpp> | |
#include <boost/iostreams/device/mapped_file.hpp> | |
#include <boost/iostreams/stream.hpp> | |
#include <boost/unordered_map.hpp> | |
#include <EWAHBoolArray/headers/ewah.h> | |
#include<boost/tokenizer.hpp> | |
#include <fstream> | |
#include <string> | |
#include <algorithm> | |
#include <iterator> | |
using namespace std; | |
using namespace boost; | |
using namespace boost::timer; | |
using namespace boost::iostreams; | |
std::shared_ptr<vector<vector<string>>> ReadFile() | |
{ | |
string data("c:\\data.csv"); | |
ifstream in(data.c_str()); | |
if (!in.is_open()) | |
{ | |
return nullptr; | |
} | |
typedef boost::tokenizer<boost::escaped_list_separator<char>> Tokenizer; | |
boost::escaped_list_separator<char> sep('\\', ',', '\"'); | |
vector<vector<string>> *rowsVector = new vector<vector<string>>(); | |
std::shared_ptr<vector<vector<string>>> rows(rowsVector); | |
vector<string> vec; | |
vector<string> headers; | |
string line; | |
string buffer; | |
bool inside_quotes(false); | |
size_t last_quote(0); | |
int count = 0; | |
while (getline(in, buffer)) | |
{ | |
if (count == 10) | |
{ | |
// Only index 1 CSV header row and 9 actual rows of data. | |
in.close(); | |
return rows; | |
} | |
else if (count > 0) | |
{ | |
// Deal with line breaks in quoted strings | |
last_quote = buffer.find_first_of('"'); | |
while (last_quote != string::npos) | |
{ | |
inside_quotes = !inside_quotes; | |
last_quote = buffer.find_first_of('"',last_quote+1); | |
} | |
line.append(buffer); | |
if (inside_quotes) | |
{ | |
line.append("\n"); | |
continue; | |
} | |
Tokenizer tok(line, sep); | |
vec.assign(tok.begin(),tok.end()); | |
line.clear(); | |
rows->push_back(vec); | |
if (count == 0) | |
{ | |
// This is the CSV header with the column names. | |
// Do something with this later. | |
} | |
} | |
count++; | |
} | |
in.close(); | |
return rows; | |
} | |
// Builds a bitmap index over the second column (index 1) of `rows`.
//
// For every distinct value found in that column the returned map holds one
// compressed bitmap in which bit i is set when rows->at(i) carries that value.
// Summing numberOfOnes() over all bitmaps therefore yields the number of rows
// that were indexed.
//
// rows - table of records, one vector of fields per record; may be null.
//        Rows with fewer than two fields are skipped.
//
// Returns a non-null map from column value to bitmap (empty when `rows` is
// null or empty).
//
// Bits are set with set(i), not add(i): add() appends `i` itself as a raw
// bitmap word, so the number of set bits becomes the popcount of the row
// number instead of one bit per row. The library requires bits to be set in
// increasing order, which the ascending row loop guarantees.
//
// NOTE(review): the EWAHBoolArray documentation instantiates the template with
// unsigned word types; the signed __int64 is kept here only because it is part
// of this function's return type. Consider switching the whole program to an
// unsigned 64-bit word.
std::shared_ptr<std::unordered_map<string, EWAHBoolArray<__int64>>> IndexData(std::shared_ptr<vector<vector<string>>> rows)
{
    auto indexMap = std::make_shared<std::unordered_map<std::string, EWAHBoolArray<__int64>>>();
    if (!rows)
    {
        return indexMap;
    }

    // Only the 2nd column is indexed (for now).
    const std::size_t indexedColumn = 1;

    const std::size_t rowCount = rows->size();
    for (std::size_t i = 0; i < rowCount; ++i)
    {
        // Bind by reference: copying every row just to read one field is wasteful.
        const std::vector<std::string> &row = (*rows)[i];
        if (row.size() <= indexedColumn)
        {
            continue;
        }

        // operator[] creates an empty bitmap the first time a value is seen.
        (*indexMap)[row[indexedColumn]].set(i);
    }
    return indexMap;
}
int _tmain(int argc, _TCHAR* argv[]) | |
{ | |
{ | |
std::shared_ptr<vector<vector<string>>> rows; | |
{ | |
cout << "READ "; | |
auto_cpu_timer timer; | |
rows = ReadFile(); | |
} | |
{ | |
std::shared_ptr<std::unordered_map<string, EWAHBoolArray<__int64>>> indexes; | |
{ | |
cout << "INDEX "; | |
auto_cpu_timer timer; | |
indexes = IndexData(rows); | |
} | |
typedef std::unordered_map<string, EWAHBoolArray<__int64>> indexIterator; | |
__int64 rowCount = 0; | |
__int64 diskByteCount = 0; | |
__int64 compressedByteCount = 0; | |
for (indexIterator::iterator it = indexes->begin(); it != indexes->end(); ++it) | |
{ | |
rowCount += it->second.numberOfOnes(); | |
diskByteCount += it->second.sizeOnDisk(); | |
compressedByteCount += it->second.computeStatistics().getCompressedSize(); | |
} | |
cout << "ROWS INDEXED: " << rowCount << endl; | |
cout << "TERMS: " << indexes->size() << endl; | |
cout << "DISK BYTES: " << diskByteCount << endl; | |
cout << "COMPRESSED BYTES: " << compressedByteCount << endl; | |
} | |
getchar(); | |
} | |
getchar(); | |
return 0; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment