Created
August 24, 2022 20:34
-
-
Save abh006/05d0171c4f8a378dadf795549bfe877e to your computer and use it in GitHub Desktop.
Splits a huge JSON array into chunked files, with one JSON object per line.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <stack>
#include <string>
#include <vector>

using std::cerr;
using std::cout;
using std::endl;
using std::ifstream;
using std::stack;
using std::string;
using std::to_string;
using std::vector;
// Directory the chunk files are written into.
std::string outFilePath("/Users/hjpotter/output");
// Prefix placed in front of the chunk index in every chunk file name.
std::string outFilePrefix("out_");

/// Builds the path of one chunk file.
///
/// @param chunkCount  Index of the chunk; inserted verbatim via std::to_string.
/// @return `outFilePath + "/" + outFilePrefix + <chunkCount> + ".txt"`.
std::string getFileName(int chunkCount) {
    return outFilePath + "/" + outFilePrefix + std::to_string(chunkCount) + ".txt";
}
int main() | |
{ | |
int batchSize = 1000; | |
string filename("/Users/hjpotter/huge-json-array.json"); | |
vector<char> bytes; | |
std::ofstream outfile; | |
FILE* input_file = fopen(filename.c_str(), "r"); | |
if (input_file == nullptr) { | |
return EXIT_FAILURE; | |
} | |
stack<unsigned char> st; | |
string json_str = ""; | |
int chunkCount = 44; | |
int lineCount = 0; | |
outfile.open( getFileName(chunkCount), std::ios_base::app); // append instead of overwrite | |
unsigned char character = 0; | |
bool startSkipped = false; | |
while (!feof(input_file)) { | |
character = getc(input_file); | |
if(!startSkipped){ | |
startSkipped = true; | |
continue; | |
} | |
if(st.empty() && character != '{'){ | |
// Next JSON not started yet. Skipping intermediate commas and spaces | |
continue; | |
} | |
json_str += character; | |
if(character == '{'){ | |
st.push(character); | |
}else if(character == '}'){ | |
st.pop(); | |
} | |
if(st.empty()){ | |
outfile << json_str <<endl; | |
json_str = ""; | |
lineCount++; | |
} | |
if(lineCount >= batchSize){ | |
cout << "Closing current file and reopening next. File count: " << chunkCount << endl; | |
outfile.close(); | |
chunkCount++; | |
lineCount = 0; | |
outfile.open( getFileName(chunkCount), std::ios_base::app); // append instead of overwrite | |
} | |
} | |
if(outfile.is_open()){ | |
outfile.close(); | |
} | |
cout << "Finished" << endl; | |
fclose(input_file); | |
return EXIT_SUCCESS; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment