Last active
December 15, 2015 08:19
-
-
Save kellabyte/5230423 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* SAMPLE CSV FILE */
/*
"Country or Area","Year","Comm. Code","Commodity","Flow","Trade (USD)","Weight (kg)","Quantity Name","Quantity"
"Afghanistan","2011","TOTAL","ALL COMMODITIES","Import","6390310947","","No Quantity",""
"Afghanistan","2011","TOTAL","ALL COMMODITIES","Export","375850935","","No Quantity",""
"Afghanistan","2010","TOTAL","ALL COMMODITIES","Import","5154249867","","No Quantity",""
"Afghanistan","2010","TOTAL","ALL COMMODITIES","Export","388483635","","No Quantity",""
"Afghanistan","2009","TOTAL","ALL COMMODITIES","Import","3336434781","","No Quantity",""
"Afghanistan","2009","TOTAL","ALL COMMODITIES","Export","403441006","","No Quantity",""
"Afghanistan","2008","TOTAL","ALL COMMODITIES","Import","3019860129","","No Quantity",""
"Afghanistan","2008","TOTAL","ALL COMMODITIES","Export","540065594","","No Quantity",""
"Albania","2011","TOTAL","ALL COMMODITIES","Import","5395853069","","No Quantity",""
"Albania","2011","TOTAL","ALL COMMODITIES","Export","1948207305","","No Quantity",""
"Albania","2010","TOTAL","ALL COMMODITIES","Import","4602774967","","No Quantity",""
"Albania","2010","TOTAL","ALL COMMODITIES","Export","1549955724","","No Quantity",""
"Albania","2010","TOTAL","ALL COMMODITIES","Re-Import","26393","","No Quantity",""
"Albania","2009","TOTAL","ALL COMMODITIES","Import","4548287875","","No Quantity",""
"Albania","2009","TOTAL","ALL COMMODITIES","Export","1087914902","","No Quantity",""
"Albania","2009","TOTAL","ALL COMMODITIES","Re-Import","272403","","No Quantity",""
"Albania","2008","TOTAL","ALL COMMODITIES","Import","5250490022","","No Quantity",""
"Albania","2008","TOTAL","ALL COMMODITIES","Export","1354921653","","No Quantity",""
"Albania","2008","TOTAL","ALL COMMODITIES","Re-Export","810868093","","No Quantity",""
"Albania","2008","TOTAL","ALL COMMODITIES","Re-Import","509068","","No Quantity",""
"Albania","2007","TOTAL","ALL COMMODITIES","Import","4200864046","","No Quantity",""
"Albania","2007","TOTAL","ALL COMMODITIES","Export","1077690359","","No Quantity",""
"Albania","2007","TOTAL","ALL COMMODITIES","Re-Import","4494753","","No Quantity",""
*/
#ifdef _MSC_VER
#define _SCL_SECURE_NO_WARNINGS
#endif
#include "stdafx.h"
#include <stdlib.h>
#include <algorithm> // copy
#include <fstream> // fstream
#include <iostream>
#include <iterator> // ostream_operator
#include <list>
#include <memory> // shared_ptr, make_shared
#include <string>
#include <unordered_map>
#include <vector>
#include <boost/cstdint.hpp>
#include <boost/timer/timer.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/discrete_distribution.hpp>
#include <boost/iostreams/device/mapped_file.hpp>
#include <boost/iostreams/stream.hpp>
#include <boost/unordered_map.hpp>
#include<boost/tokenizer.hpp>
#include <strtk\strtk.hpp>
#include <EWAHBoolArray/headers/ewah.h>
#include "MurmurHash3.h"
using namespace std;
using namespace boost;
using namespace boost::timer;
using namespace boost::random;
using namespace boost::iostreams;
// Sizing constants. Not referenced elsewhere in the visible part of this file
// (the functions below that use the name rowCount declare their own local).
const int rowCount = 1000;
const int indexCount = 100;
/// Loads at most 24 data records from the CSV file at c:\data.csv.
///
/// The first physical line of the file (the column-name header) is skipped.
/// Each following record is split into fields with
/// boost::escaped_list_separator ('\\' escape, ',' delimiter, '"' quote) and
/// appended to the result. A record whose quoted field spans several physical
/// lines is re-joined with '\n' before it is tokenized.
///
/// When the end of the file is reached before the row limit, the total unused
/// string capacity (capacity() - size(), summed over every field) is printed
/// to stdout. That report is skipped when the row limit ends the read early.
///
/// @return The parsed rows, or nullptr when the file cannot be opened.
std::shared_ptr<vector<vector<string>>> ReadFile()
{
    // The stream closes itself when it goes out of scope on every return path.
    ifstream in("c:\\data.csv");
    if (!in.is_open())
    {
        return nullptr;
    }

    typedef boost::tokenizer<boost::escaped_list_separator<char>> Tokenizer;
    const boost::escaped_list_separator<char> sep('\\', ',', '\"');

    auto rows = std::make_shared<vector<vector<string>>>();
    vector<string> fields;      // reused for every record
    string record;              // logical record, possibly built from several lines
    string buffer;              // current physical line
    bool inside_quotes = false; // true while a quoted field is still open
    int count = 0;              // completed records, header included

    while (getline(in, buffer))
    {
        if (count == 25)
        {
            // The header and 24 data rows have been consumed: stop here.
            return rows;
        }
        if (count > 0)
        {
            // Every '"' toggles the quote state; an odd number of quotes on
            // this line means the record continues on the next one.
            for (size_t quote = buffer.find('"');
                 quote != string::npos;
                 quote = buffer.find('"', quote + 1))
            {
                inside_quotes = !inside_quotes;
            }

            record.append(buffer);
            if (inside_quotes)
            {
                // Restore the line break that getline() removed and keep
                // reading; the record is not complete, so count is unchanged.
                record.append("\n");
                continue;
            }

            Tokenizer tok(record, sep);
            fields.assign(tok.begin(), tok.end());
            record.clear();
            rows->push_back(fields);
        }
        // count == 0 is the CSV header with the column names; it is skipped.
        count++;
    }

    // Report how many allocated-but-unused bytes the field strings hold.
    size_t wastedMemory = 0;
    for (const auto &row : *rows)
    {
        for (const auto &field : row)
        {
            wastedMemory += field.capacity() - field.size();
        }
    }
    cout << "Wasted memory" << wastedMemory << endl;
    return rows;
}
/// Builds one compressed bitmap per distinct value of the second column.
///
/// For every row that has at least two fields, bit i (the row's position in
/// `rows`) is set in the bitmap keyed by that row's second field. Rows are
/// visited in increasing order, so the bits of each bitmap are set in
/// increasing order as well.
///
/// @param rows Parsed CSV rows; may be null (e.g. when the file could not be
///             read), in which case an empty index is returned.
/// @return Map from second-column value to the bitmap of row positions.
std::shared_ptr<std::unordered_map<string, EWAHBoolArray<uint32_t>>> IndexData(std::shared_ptr<vector<vector<string>>> rows)
{
    auto indexMap = std::make_shared<std::unordered_map<string, EWAHBoolArray<uint32_t>>>();
    if (!rows)
    {
        return indexMap;
    }

    const size_t totalRows = rows->size();
    for (size_t i = 0; i < totalRows; ++i)
    {
        // Borrow the row instead of copying it.
        const vector<string> &row = (*rows)[i];

        // Only the 2nd column is indexed (for now); shorter rows are skipped.
        if (row.size() > 1)
        {
            // operator[] creates an empty bitmap the first time a value is seen.
            (*indexMap)[row[1]].set(i);
        }
    }
    return indexMap;
}
int _tmain(int argc, _TCHAR* argv[]) | |
{ | |
std::shared_ptr<vector<vector<string>>> rows; | |
{ | |
cout << "READ "; | |
auto_cpu_timer timer; | |
rows = ReadFile(); | |
} | |
{ | |
std::shared_ptr<std::unordered_map<string, EWAHBoolArray<uint32_t>>> indexes; | |
{ | |
cout << "INDEX "; | |
auto_cpu_timer timer; | |
indexes = IndexData(rows); | |
} | |
typedef std::unordered_map<string, EWAHBoolArray<uint32_t>> indexIterator; | |
__int64 capacityCount = 0; | |
__int64 rowCount = 0; | |
__int64 actualRowCount = 0; | |
__int64 diskByteCount = 0; | |
__int64 compressedByteCount = 0; | |
for(const auto it: *indexes) | |
{ | |
for (EWAHBoolArray<uint32_t>::const_iterator it2 = it.second.begin(); it2 != it.second.end(); ++it2) | |
{ | |
actualRowCount++; | |
} | |
rowCount += it.second.numberOfOnes(); | |
diskByteCount += it.second.sizeOnDisk(); | |
compressedByteCount += it.second.computeStatistics().getCompressedSize(); | |
} | |
cout << "TOTAL CAPACITY: " << rows->capacity() << endl; // 28 | |
cout << "TOTAL ROWS: " << rows->size() << endl; // 24 | |
cout << "ACTUAL ROWS: " << actualRowCount << endl; // 26 ?? Why is this wrong? | |
cout << "INDEXED ROWS: " << rowCount << endl; // 24 | |
cout << "TERMS: " << indexes->size() << endl; // 6 | |
cout << "DISK BYTES: " << diskByteCount << endl; // 144 | |
cout << "COMPRESSED BYTES: " << compressedByteCount << endl; // 12 | |
} | |
getchar(); | |
return 0; | |
} |
Please see a simplified example at
https://github.com/lemire/EWAHBoolArray/blob/master/example2.cpp
At the end of the file, you will see the output that I get. As you can see, I observe no discrepancy.
You seem to be using a Microsoft compiler: note that I do not have access to a Microsoft compiler and I rely on contributions to help me fix bugs for Microsoft compilers. I only actively support GNU GCC and Clang, as these are the compilers I use.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
main's rows is still in scope at getchar(). Try wrapping the code above in another pair of braces to force the scope to end and the pointer to destruct.