/*
  Testing read file speed for the three read functions from
  http://cpp.indi.frih.net/blog/2014/09/how-to-read-an-entire-file-into-memory-in-cpp/

  compile with -std=c++11

  Define READ_SPEED_TEST_NO_MAIN to compile everything except main(), e.g. to
  exercise the helpers from a separate test driver.
*/
#include <array>
#include <cassert>
#include <cctype>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <deque>
#include <exception>
#include <fstream>
#include <ios>
#include <iostream>
#include <istream>
#include <limits>
#include <memory>
#include <ostream>
#include <sstream>
#include <stdio.h>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>

/*
  Go-to method for small files (<100K):

    auto ss = std::ostringstream{};
    ss << in.rdbuf();
    auto s = ss.str();

  Problem: the data has to be copied from the ostringstream into a string, so
  for large inputs two copies of the data are in memory at the same time.

  Parameters:
    in    - stream to drain; it is read through its stream buffer.
    alloc - allocator used for the intermediate buffer.
  Returns the remaining content of `in` (empty if there is none).
  Throws std::ios_base::failure if the transfer fails.
*/
template <typename CharT, typename Traits = std::char_traits<CharT>,
          typename Allocator = std::allocator<CharT> >
std::basic_string<CharT, Traits, Allocator> read_stream_into_string(
    std::basic_istream<CharT, Traits>& in, Allocator alloc = {})
{
    using string_type = std::basic_string<CharT, Traits, Allocator>;

    std::basic_ostringstream<CharT, Traits, Allocator> ss{
        string_type(std::move(alloc)) };

    if (!(ss << in.rdbuf())) {
        // operator<<(streambuf*) sets failbit when it inserted no characters
        // at all. That is not an error when the source simply has nothing
        // left, so only report a failure if data is still pending (or the
        // stream has no buffer / the sink went bad).
        auto* const source = in.rdbuf();
        const bool source_exhausted = !ss.bad() && source != nullptr
            && Traits::eq_int_type(source->sgetc(), Traits::eof());
        if (!source_exhausted)
            throw std::ios_base::failure{ "error" };
    }

    return ss.str();
}

/*
  Reading straight into a container:

  When dealing with files it can be faster to count all the characters first,
  then do one big allocation and one big read:

    auto const start_pos = in.tellg();
    in.ignore(std::numeric_limits<std::streamsize>::max());
    auto const char_count = in.gcount();
    in.seekg(start_pos);
    auto s = std::string(char_count, char{});
    in.read(&s[0], s.size());

  in.ignore() is a safe way to count the bytes, but it means the data is read
  twice: once to count and once to store. The stream must be seekable.

  Container may be a basic_string of the stream's character type, or a vector
  of that character type or of its signed/unsigned counterpart.

  Throws std::ios_base::failure if telling, skipping, seeking or reading fails.
*/
template <typename Container = std::string, typename CharT = char,
          typename Traits = std::char_traits<CharT> >
Container read_stream_into_container(
    std::basic_istream<CharT, Traits>& in,
    typename Container::allocator_type alloc = {})
{
    using allocator_type = typename Container::allocator_type;
    using unsigned_char_type = typename std::make_unsigned<CharT>::type;
    using signed_char_type = typename std::make_signed<CharT>::type;
    using stream_type = std::basic_istream<CharT, Traits>;
    using pos_type = typename stream_type::pos_type;
    using off_type = typename stream_type::off_type;

    static_assert(
        // Allow only strings...
        std::is_same<Container,
                     std::basic_string<CharT, Traits, allocator_type> >::value
            // ... and vectors of the plain, signed, and unsigned flavours
            // of CharT.
            || std::is_same<Container,
                            std::vector<CharT, allocator_type> >::value
            || std::is_same<Container,
                            std::vector<unsigned_char_type, allocator_type> >::value
            || std::is_same<Container,
                            std::vector<signed_char_type, allocator_type> >::value,
        "only strings and vectors of ((un)signed) CharT allowed");

    auto const start_pos = in.tellg();
    if (pos_type(off_type(-1)) == start_pos)
        throw std::ios_base::failure{ "error" };

    // First pass: skip to the end purely to learn how many characters remain.
    if (!in.ignore(std::numeric_limits<std::streamsize>::max()))
        throw std::ios_base::failure{ "error" };
    auto const char_count = in.gcount();

    if (!in.seekg(start_pos))
        throw std::ios_base::failure{ "error" };

    auto container = Container(std::move(alloc));
    container.resize(static_cast<typename Container::size_type>(char_count));

    // Second pass: one bulk read into the pre-sized storage.
    if (0 != container.size()) {
        if (!in.read(reinterpret_cast<CharT*>(&container[0]),
                     static_cast<std::streamsize>(container.size())))
            throw std::ios_base::failure{ "error" };
    }

    return container;
}

/*
  Read chunks into a deque:

  If you are expecting enormous files (at least several hundreds of megabytes,
  on average) and you do not want to seek on the stream, read the file in
  chunks into a deque. The advantage is that nothing is copied, unless you
  cannot work with the deque and end up copying the data out of it.

  CharO is the element type of the result; it must be the stream's character
  type or its signed/unsigned counterpart.

  Throws std::ios_base::failure if the stream reports an unrecoverable
  (badbit) error while reading.
*/
template <typename CharT, typename Traits = std::char_traits<CharT>,
          typename CharO = CharT, typename Allocator = std::allocator<CharO> >
std::deque<CharO, Allocator> read_file_into_deque(
    std::basic_istream<CharT, Traits>& in, Allocator alloc = {})
{
    static_assert(
        std::is_same<CharT, CharO>::value
            || std::is_same<typename std::make_unsigned<CharT>::type, CharO>::value
            || std::is_same<typename std::make_signed<CharT>::type, CharO>::value,
        "char type of deque must be same "
        "as stream char type "
        "(possibly signed or unsigned)");

    constexpr std::size_t chunk_size = BUFSIZ;

    auto container = std::deque<CharO, Allocator>(std::move(alloc));
    auto chunk = std::array<CharO, chunk_size>{};

    // The final read usually comes up short and sets failbit/eofbit, but the
    // characters it did deliver (gcount) still have to be appended.
    while (in.read(reinterpret_cast<CharT*>(chunk.data()),
                   static_cast<std::streamsize>(chunk.size()))
           || in.gcount())
        container.insert(container.end(), chunk.begin(),
                         chunk.begin() + in.gcount());

    // Running out of data is the normal way to finish; a bad stream is not.
    if (in.bad())
        throw std::ios_base::failure{ "error" };

    return container;
}

/* Testing section */

/*
  humanize, after
  https://github.com/DragonFlyBSD/DragonFlyBSD/blob/master/lib/libtcplay/humanize.c
*/
static const char prefixes[] = " KMGTPE";

/*
  Formats a byte count with a binary (1024-based) unit suffix and at most one
  truncated decimal digit: 512 -> "512", 1024 -> "1K", 1536 -> "1.5K".
  `num` is converted to an unsigned 64-bit value first.
*/
template <typename T>
std::string humanize(T num)
{
    const char* prefixp = prefixes;
    std::uint64_t whole = static_cast<std::uint64_t>(num);
    std::uint64_t tenths = 0;

    // prefixp[1] guards against stepping past the last known prefix.
    while (whole >= 1024 && prefixp[1] != '\0') {
        tenths = ((whole % 1024) * 10) / 1024;
        whole /= 1024;
        ++prefixp;
    }

    std::string text = std::to_string(whole);
    if (tenths > 0) {
        text += '.';
        text += std::to_string(tenths);
    }
    if (prefixp != prefixes)
        text += *prefixp; // unscaled values get no suffix
    return text;
}

/*
  dehumanize_number, after
  http://cvsweb.netbsd.org/bsdweb.cgi/~checkout~/src/lib/libc/gen/dehumanize_number.c

  Parses a non-negative decimal integer with an optional, case-insensitive
  suffix: b (x1), k (x1024), m (x1024^2) or g (x1024^3).

  Returns the value in bytes, or 0 when the text is empty, is not made of
  decimal digits, carries an unknown suffix, or does not fit in 64 bits.
  Signs and surrounding whitespace are rejected. Never throws.
*/
uint64_t dehumanize_number(const std::string& str)
{
    if (str.empty())
        return 0;

    std::uint64_t multiplier = 1;
    std::size_t digits_end = str.size();

    const unsigned char unit = static_cast<unsigned char>(str[str.size() - 1]);
    if (std::isalpha(unit)) {
        switch (std::tolower(unit)) {
        case 'b':
            multiplier = 1;
            break;
        case 'k':
            multiplier = 1024ULL;
            break;
        case 'm':
            multiplier = 1024ULL * 1024;
            break;
        case 'g':
            multiplier = 1024ULL * 1024 * 1024;
            break;
        default:
            return 0; /* Invalid suffix. */
        }
        digits_end = str.size() - 1;
    }

    if (digits_end == 0)
        return 0; /* Suffix without a number. */

    const std::uint64_t limit = std::numeric_limits<std::uint64_t>::max();
    std::uint64_t value = 0;
    for (std::size_t pos = 0; pos < digits_end; ++pos) {
        const unsigned char ch = static_cast<unsigned char>(str[pos]);
        if (!std::isdigit(ch))
            return 0; /* Not a number. */
        const std::uint64_t digit = static_cast<std::uint64_t>(ch - '0');
        if (value > (limit - digit) / 10)
            return 0; /* Out of range. */
        value = value * 10 + digit;
    }

    if (value > limit / multiplier)
        return 0; /* Out of range. */

    return value * multiplier;
}

/*
  Creates a temporary file of exactly `size` bytes whose last byte is 'X'.
  Only that byte is written; the stream is positioned at size - 1 first, so
  everything before it is left for the file system to fill in. A size of 0
  produces an empty file.

  Returns the name of the new file; the caller is responsible for removing it.
  Throws std::ios_base::failure if no name could be generated or the file
  could not be created and written.
*/
std::string create_empty_file(size_t size)
{
    // Despite the warning, tmpnam is portable and there is no risk for this
    // program.
    const char* const generated = std::tmpnam(nullptr);
    if (generated == nullptr)
        throw std::ios_base::failure{ "cannot generate a temporary file name" };
    std::string name = generated;

    std::ofstream out(name, std::ofstream::binary);
    if (!out)
        throw std::ios_base::failure{ "cannot create temporary file" };

    if (size > 0) {
        // A failed seek is sticky: it makes the write a no-op and is caught
        // by the state check below.
        out.seekp(static_cast<std::streamoff>(size - 1));
        out << 'X';
    }
    out.close();

    if (!out) {
        std::remove(name.c_str());
        throw std::ios_base::failure{ "error" };
    }
    return name;
}

/* Removes the named file when it goes out of scope, even during unwinding. */
class scoped_temp_file {
public:
    explicit scoped_temp_file(std::string path)
        : path_(std::move(path))
    {
    }
    ~scoped_temp_file() { std::remove(path_.c_str()); }

    scoped_temp_file(const scoped_temp_file&) = delete;
    scoped_temp_file& operator=(const scoped_temp_file&) = delete;

    const std::string& path() const { return path_; }

private:
    std::string path_;
};

/*
  Times one reader on a freshly created temporary file of `test_size` bytes
  and checks what it returned.

  Parameters:
    test_name        - label printed in front of the timing.
    fn               - reader, called as fn(stream, {}); its result must
                       provide size() and operator[].
    test_size        - size of the temporary file in bytes.
    skip_long_verify - when true only the size and the last byte are checked;
                       otherwise every other byte must be zero as well.
  Returns true when the result passed verification.
  Lets exceptions from file creation or from the reader propagate; the
  temporary file is removed in either case.
*/
template <typename F>
bool test(const std::string& test_name, F fn, size_t test_size,
          bool skip_long_verify)
{
    // Declared before the stream so the stream is closed before the file is
    // removed.
    const scoped_temp_file temp_file(create_empty_file(test_size));

    std::ifstream in(temp_file.path(), std::ifstream::binary);
    if (!in)
        throw std::ios_base::failure{ "cannot open temporary file" };

    const auto start = std::chrono::steady_clock::now();
    auto s = fn(in, {});
    const auto end = std::chrono::steady_clock::now();

    std::cout << test_name << ":"
              << std::chrono::duration_cast<std::chrono::milliseconds>(
                     end - start).count()
              << "ms\n";

    // content tests
    if (s.size() != test_size) {
        std::cout << "FAILED:wrong size " << s.size() << '\n';
        return false;
    }
    if (s.size() == 0)
        return true; // nothing else to inspect

    if (s[s.size() - 1] != 'X') {
        std::cout << "FAILED:last byte is wrong\n";
        return false;
    }

    if (skip_long_verify)
        return true;

    // Everything in front of the trailing 'X' must read back as zero.
    for (size_t i = 0; i + 1 < s.size(); ++i) {
        if (s[i]) {
            std::cout << "FAILED:[" << i << "]!=0\n";
            return false;
        }
    }
    return true;
}

/*
  Runs all three readers against a file of `test_size` bytes.
  Returns true only if every reader passed.
*/
bool test_all(size_t test_size, bool skip_long_verify = false)
{
    std::cout << "test_size:" << humanize(test_size)
              << ", BUFSIZ:" << BUFSIZ << '\n';

    bool t1 = test("read_stream_into_string", &read_stream_into_string<char>,
                   test_size, skip_long_verify);
    bool t2 = test("read_stream_into_container",
                   &read_stream_into_container<std::string>, test_size,
                   skip_long_verify);
    bool t3 = test("read_file_into_deque", read_file_into_deque<char>,
                   test_size, skip_long_verify);

    // Flush so progress is visible between (possibly slow) rounds.
    std::cout << std::endl;

    return (t1 && t2 && t3);
}

/* Prints the command-line help. */
void usage(const std::string& prog_name)
{
    std::cout
        << "Usage: " << prog_name << " [test_file_size] [max_size step]\n\n"
        << " Example: " << prog_name << " 1m 100m 500k\n"
        << " Times read algorithms with files of 1M to 100M in steps of 500K\n"
        << "Minimum size for files is 2 and using step 0 is troublemaking.\n";
}

#ifndef READ_SPEED_TEST_NO_MAIN
/*
  Command-line driver.

    prog                      one round with a 1M file
    prog size                 one round with a file of `size`
    prog size max_size step   rounds from `size` to `max_size` in `step`s

  Exit status is 0 when every round passed and 1 on bad arguments, a failed
  round, or an I/O error.
*/
int main(int argc, char* argv[])
{
    const std::string prog_name =
        (argc > 0 && argv[0] != nullptr) ? argv[0] : "read_speed_test";

    // Sizes that are invalid or do not fit in size_t become 0 and are
    // rejected below.
    const auto parse_size = [](const std::string& text) -> size_t {
        const std::uint64_t value = dehumanize_number(text);
        return value > std::numeric_limits<size_t>::max()
            ? 0
            : static_cast<size_t>(value);
    };

    // max_size and step only make sense together.
    if (argc == 3 || argc > 4) {
        usage(prog_name);
        return 1;
    }

    size_t test_size = parse_size("1M");
    if (argc >= 2) {
        test_size = parse_size(argv[1]);
    }

    size_t test_size_max = test_size;
    size_t step = 1;
    if (argc == 4) {
        test_size_max = parse_size(argv[2]);
        step = parse_size(argv[3]);
    }

    if ((test_size <= 1) || (test_size > test_size_max) || (step == 0)) {
        usage(prog_name);
        return 1;
    }

    try {
        for (;;) {
            if (!test_all(test_size))
                return 1;
            // Stop before the increment could pass the limit or wrap around.
            if (test_size_max - test_size < step)
                break;
            test_size += step;
        }
    } catch (const std::exception& e) {
        std::cerr << "error: " << e.what() << '\n';
        return 1;
    }

    return 0;
}
#endif // READ_SPEED_TEST_NO_MAIN