Skip to content

Instantly share code, notes, and snippets.

@kellabyte
Last active December 15, 2015 08:19
Show Gist options
  • Save kellabyte/5230423 to your computer and use it in GitHub Desktop.
Save kellabyte/5230423 to your computer and use it in GitHub Desktop.
/* SAMPLE CSV FILE */
/*
"Country or Area","Year","Comm. Code","Commodity","Flow","Trade (USD)","Weight (kg)","Quantity Name","Quantity"
"Afghanistan","2011","TOTAL","ALL COMMODITIES","Import","6390310947","","No Quantity",""
"Afghanistan","2011","TOTAL","ALL COMMODITIES","Export","375850935","","No Quantity",""
"Afghanistan","2010","TOTAL","ALL COMMODITIES","Import","5154249867","","No Quantity",""
"Afghanistan","2010","TOTAL","ALL COMMODITIES","Export","388483635","","No Quantity",""
"Afghanistan","2009","TOTAL","ALL COMMODITIES","Import","3336434781","","No Quantity",""
"Afghanistan","2009","TOTAL","ALL COMMODITIES","Export","403441006","","No Quantity",""
"Afghanistan","2008","TOTAL","ALL COMMODITIES","Import","3019860129","","No Quantity",""
"Afghanistan","2008","TOTAL","ALL COMMODITIES","Export","540065594","","No Quantity",""
"Albania","2011","TOTAL","ALL COMMODITIES","Import","5395853069","","No Quantity",""
"Albania","2011","TOTAL","ALL COMMODITIES","Export","1948207305","","No Quantity",""
"Albania","2010","TOTAL","ALL COMMODITIES","Import","4602774967","","No Quantity",""
"Albania","2010","TOTAL","ALL COMMODITIES","Export","1549955724","","No Quantity",""
"Albania","2010","TOTAL","ALL COMMODITIES","Re-Import","26393","","No Quantity",""
"Albania","2009","TOTAL","ALL COMMODITIES","Import","4548287875","","No Quantity",""
"Albania","2009","TOTAL","ALL COMMODITIES","Export","1087914902","","No Quantity",""
"Albania","2009","TOTAL","ALL COMMODITIES","Re-Import","272403","","No Quantity",""
"Albania","2008","TOTAL","ALL COMMODITIES","Import","5250490022","","No Quantity",""
"Albania","2008","TOTAL","ALL COMMODITIES","Export","1354921653","","No Quantity",""
"Albania","2008","TOTAL","ALL COMMODITIES","Re-Export","810868093","","No Quantity",""
"Albania","2008","TOTAL","ALL COMMODITIES","Re-Import","509068","","No Quantity",""
"Albania","2007","TOTAL","ALL COMMODITIES","Import","4200864046","","No Quantity",""
"Albania","2007","TOTAL","ALL COMMODITIES","Export","1077690359","","No Quantity",""
"Albania","2007","TOTAL","ALL COMMODITIES","Re-Import","4494753","","No Quantity",""
*/
#ifdef _MSC_VER
#define _SCL_SECURE_NO_WARNINGS
#endif
#include "stdafx.h"
#include <stdlib.h>
#include <algorithm> // copy
#include <fstream> // fstream
#include <iostream>
#include <iterator> // ostream_operator
#include <list>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include <boost/cstdint.hpp>
#include <boost/timer/timer.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/discrete_distribution.hpp>
#include <boost/iostreams/device/mapped_file.hpp>
#include <boost/iostreams/stream.hpp>
#include <boost/unordered_map.hpp>
#include<boost/tokenizer.hpp>
#include <strtk\strtk.hpp>
#include <EWAHBoolArray/headers/ewah.h>
#include "MurmurHash3.h"
using namespace std;
using namespace boost;
using namespace boost::timer;
using namespace boost::random;
using namespace boost::iostreams;
// File-scope sizing constants.
// NOTE(review): no use of either constant is visible in this file - confirm
// against the rest of the project before removing them.
const int rowCount = 1000;
const int indexCount = 100;
/// Loads the first 24 data records of the CSV file at c:\data.csv.
///
/// The first physical line is treated as the column-header row and is
/// discarded. Every following record is split with Boost.Tokenizer's
/// escaped_list_separator (escape '\', delimiter ',', quote '"'). A record
/// whose quoted field contains a line break is reassembled from several
/// physical lines, joined with '\n', before it is tokenized. A record that is
/// still inside an open quote when the file ends is discarded.
///
/// After loading, the sum of (capacity - size) over every stored field is
/// written to stdout as a rough measure of string over-allocation.
///
/// @return The parsed rows (one vector<string> per record), or nullptr when
///         the file cannot be opened.
std::shared_ptr<vector<vector<string>>> ReadFile()
{
    // One header line plus the 24 data records we are willing to load.
    const int maxRecords = 25;

    ifstream in("c:\\data.csv");
    if (!in.is_open())
    {
        return nullptr;
    }

    typedef boost::tokenizer<boost::escaped_list_separator<char>> Tokenizer;
    const boost::escaped_list_separator<char> sep('\\', ',', '\"');

    auto rows = std::make_shared<vector<vector<string>>>();

    string record;              // Logical record, possibly built from several physical lines.
    string buffer;              // Current physical line.
    bool inside_quotes = false; // True while an opening quote has not been closed yet.
    int recordCount = 0;        // Completed logical records seen so far (header included).

    // Leaving the loop (instead of returning from inside it) once the limit is
    // reached guarantees that the wasted-memory report below always runs.
    while (recordCount < maxRecords && getline(in, buffer))
    {
        if (recordCount == 0)
        {
            // This is the CSV header with the column names.
            // Do something with this later.
            ++recordCount;
            continue;
        }

        // Deal with line breaks in quoted strings: every quote character
        // flips the "inside a quoted field" state.
        for (size_t quote = buffer.find('"');
             quote != string::npos;
             quote = buffer.find('"', quote + 1))
        {
            inside_quotes = !inside_quotes;
        }

        record.append(buffer);
        if (inside_quotes)
        {
            // The record continues on the next physical line; restore the
            // newline that getline() stripped and keep accumulating.
            record.append("\n");
            continue;
        }

        Tokenizer tok(record, sep);
        // Build the row in place rather than copying it through a temporary vector.
        rows->emplace_back(tok.begin(), tok.end());
        record.clear();
        ++recordCount;
    }

    size_t wastedMemory = 0;
    for (const auto& row : *rows)
    {
        for (const auto& field : row)
        {
            wastedMemory += field.capacity() - field.size();
        }
    }
    cout << "Wasted memory" << wastedMemory << endl;

    // The ifstream closes itself when it goes out of scope.
    return rows;
}
/// Builds one compressed bitmap per distinct value found in the 2nd column.
///
/// For every row i that has at least two columns, bit i is set in the
/// EWAHBoolArray stored under the key row[1]. Rows with fewer than two
/// columns are skipped.
///
/// @param rows Parsed CSV rows; may be null (for example when the input file
///             could not be opened), in which case an empty map is returned.
/// @return Map from column value to the bitmap of row positions holding it.
std::shared_ptr<std::unordered_map<string, EWAHBoolArray<uint32_t>>> IndexData(std::shared_ptr<vector<vector<string>>> rows)
{
    typedef std::unordered_map<string, EWAHBoolArray<uint32_t>> IndexMap;

    // Only the 2nd column is indexed (for now).
    const size_t indexedColumn = 1;

    auto indexMap = std::make_shared<IndexMap>();
    if (!rows)
    {
        return indexMap;
    }

    const size_t totalRows = rows->size();
    for (size_t i = 0; i < totalRows; ++i)
    {
        // Bind by reference: copying the row would deep-copy every field.
        const vector<string>& row = (*rows)[i];
        if (row.size() <= indexedColumn)
        {
            continue;
        }

        // operator[] creates an empty bitmap the first time a value is seen.
        // Row positions arrive in increasing order.
        (*indexMap)[row[indexedColumn]].set(i);
    }
    return indexMap;
}
int _tmain(int argc, _TCHAR* argv[])
{
std::shared_ptr<vector<vector<string>>> rows;
{
cout << "READ ";
auto_cpu_timer timer;
rows = ReadFile();
}
{
std::shared_ptr<std::unordered_map<string, EWAHBoolArray<uint32_t>>> indexes;
{
cout << "INDEX ";
auto_cpu_timer timer;
indexes = IndexData(rows);
}
typedef std::unordered_map<string, EWAHBoolArray<uint32_t>> indexIterator;
__int64 capacityCount = 0;
__int64 rowCount = 0;
__int64 actualRowCount = 0;
__int64 diskByteCount = 0;
__int64 compressedByteCount = 0;
for(const auto it: *indexes)
{
for (EWAHBoolArray<uint32_t>::const_iterator it2 = it.second.begin(); it2 != it.second.end(); ++it2)
{
actualRowCount++;
}
rowCount += it.second.numberOfOnes();
diskByteCount += it.second.sizeOnDisk();
compressedByteCount += it.second.computeStatistics().getCompressedSize();
}
cout << "TOTAL CAPACITY: " << rows->capacity() << endl; // 28
cout << "TOTAL ROWS: " << rows->size() << endl; // 24
cout << "ACTUAL ROWS: " << actualRowCount << endl; // 26 ?? Why is this wrong?
cout << "INDEXED ROWS: " << rowCount << endl; // 24
cout << "TERMS: " << indexes->size() << endl; // 6
cout << "DISK BYTES: " << diskByteCount << endl; // 144
cout << "COMPRESSED BYTES: " << compressedByteCount << endl; // 12
}
getchar();
return 0;
}
@bitcrazed
Copy link

main's rows is still in scope at getchar(). Try wrapping the code above in another pair of braces to force the scope to end and the pointer to destruct.

@lemire
Copy link

lemire commented Mar 27, 2013

Please see a simplified example at

https://github.com/lemire/EWAHBoolArray/blob/master/example2.cpp

At the end of the file, you will see the output that I get. As you can see, I observe no discrepancy.

You seem to be using a Microsoft compiler: note that I do not have access to a Microsoft compiler and I rely on contributions to help me fix bugs for Microsoft compilers. I only actively support GNU GCC and Clang, as these are the compilers I use.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment