You probably want to put this script in your repo and run it via `npx` and `tsx`, as shown below.
First, make sure the dependencies are installed with npm/yarn/pnpm. You need `ignore`, `lodash` and `yargs`.
npx tsx src/scripts/serialize-repo.ts
Cheers! ✨
Options:
      --version  Show version number                                   [boolean]
  -s, --size     Chunk size in megabytes          [number] [default: Infinity]
  -p, --path     Base path to serialize (optional)                     [string]
      --help     Show help                                             [boolean]

Examples:
  pnpm serialize-repo                         Serialize entire repository into a single file
  pnpm serialize-repo -s 10                   Split repository into 10MB chunks
  pnpm serialize-repo -p src/app              Serialize only the src/app directory
  pnpm serialize-repo -s 5 -p src/components  Split src/components into 5MB chunks
| /** | |
| * This script reads all text-based files in a Git repository or a specified directory, | |
| * splits them into chunks based on size, and writes those chunks to disk in a structured format. | |
| * It also calculates a checksum to keep track of repository state. | |
| */ | |
import { execFileSync, execSync } from 'child_process';
import crypto from 'crypto';
import fs from 'fs/promises';
import ignore, { Ignore } from 'ignore';
import _ from 'lodash';
import path from 'path';
import yargs from 'yargs';
import { hideBin } from 'yargs/helpers';
| /** Module-wide logger used for all status, warning and error output; currently the global `console` (replace with your preferred logger). */ | |
| const logger = console; | |
/**
 * Lower-case file extensions (leading dot included) that are treated as binary.
 * A file whose extension appears here is skipped without its content being inspected.
 */
const BINARY_FILE_EXTENSIONS = new Set<string>([
  '.jpg', '.jpeg', '.png', '.gif',
  '.ico', '.webp', '.pdf', '.mp4',
  '.webm', '.mov', '.mp3', '.wav',
  '.ttf', '.woff', '.woff2', '.eot',
  '.exe', '.dll', '.bin', '.iso',
  '.img', '.dmg', '.dat', '.sys',
  '.so', '.o', '.a', '.lib',
  '.class', '.jar', '.apk', '.com',
  '.elf', '.drv', '.rom', '.vhd',
  '.vhdx', '.gho', '.efi', '.bpl',
  '.cpl', '.ocx', '.scr', '.rco',
  '.ovl', '.mo', '.nib', '.xap',
  '.psf', '.pak', '.img3', '.img4',
  '.msi', '.cab', '.otf', '.cur',
  '.ani', '.swf', '.fla', '.flv',
  '.mpg', '.mpeg', '.avi', '.wmv',
  '.mkv', '.ogg', '.ogv', '.wma',
  '.mid', '.midi', '.aac', '.flac',
  '.bmp', '.psd', '.ai', '.eps',
  '.raw', '.tif', '.tiff', '.3ds',
  '.max', '.obj', '.fbx', '.blend',
  '.crt', '.key', '.pem', '.der',
  '.png2', '.jp2', '.swc', '.mso',
  '.p12', '.p7b', '.gbr', '.pcb',
  '.icns', '.xdf', '.zip', '.rar',
  '.7z', '.gz', '.tar', '.tgz',
  '.bz2', '.xz',
]);
/** One file collected for serialization: where it lives and what it contains. */
interface FileEntry {
  /** Relative file path. */
  path: string;
  /** File content as a string. */
  content: string;
}
/**
 * Assertion helper for `catch` clauses: after it returns, the caught value is typed as `Error`.
 * @param error The caught value of unknown type.
 * @throws An `Error` with message 'Unknown error type' when the value is not an `Error` instance.
 */
function assertError(error: unknown): asserts error is Error {
  if (error instanceof Error) {
    return;
  }
  throw new Error('Unknown error type');
}
/**
 * Builds an Ignore matcher from the `.gitignore` file in the current working directory.
 * If the file cannot be read, a warning is logged and an empty matcher is returned.
 * @returns An Ignore instance holding the rules that were loaded (possibly none).
 */
async function readGitignore(): Promise<Ignore> {
  const rules = ignore();
  try {
    rules.add(await fs.readFile('.gitignore', 'utf-8'));
  } catch (error) {
    assertError(error);
    logger.warn('No .gitignore found, proceeding without it:', error.message);
  }
  return rules;
}
/**
 * Decides whether a file should be treated as text.
 *
 * A file is rejected when its (lower-cased) extension is listed in
 * `BINARY_FILE_EXTENSIONS`, or when a null byte appears in its first 4KB.
 * Any I/O failure is logged and reported as "not text".
 * @param filePath Path of the file to inspect.
 * @returns `true` if the file looks like text, `false` if it looks binary or cannot be read.
 */
async function isTextFile(filePath: string): Promise<boolean> {
  const ext = path.extname(filePath).toLowerCase();
  if (BINARY_FILE_EXTENSIONS.has(ext)) return false;

  try {
    const handle = await fs.open(filePath, 'r');
    try {
      const sample = Buffer.alloc(4096);
      const { bytesRead } = await handle.read(sample, 0, sample.length, 0);
      // Only the bytes actually read are meaningful; the rest of the buffer is zero-filled.
      return !sample.subarray(0, bytesRead).includes(0);
    } finally {
      // Close in `finally` so the descriptor is released even when read() throws.
      await handle.close();
    }
  } catch (error) {
    assertError(error);
    logger.error('Error checking file type:', error.message);
    return false;
  }
}
/**
 * Recursively walks a directory and yields the full paths of text files,
 * skipping anything matched by the supplied ignore rules.
 *
 * Git's own `.git` directory is never entered: it is not listed in `.gitignore`,
 * so without this guard its contents would be walked like ordinary files.
 * @param dir The directory to walk.
 * @param ig Ignore instance for filtering paths.
 * @param basePath Optional root path; entries that do not lie inside it are skipped.
 * @param base Internal parameter: path of `dir` relative to the walk's starting directory.
 */
async function* walkDirectory(
  dir: string,
  ig: Ignore,
  basePath?: string,
  base = '',
): AsyncGenerator<string> {
  const entries = await fs.readdir(dir, { withFileTypes: true });
  for (const entry of entries) {
    const relativePath = path.join(base, entry.name);
    const fullPath = path.join(dir, entry.name);
    const isDirectory = entry.isDirectory();

    // Repository metadata is not source content and is never covered by .gitignore.
    if (isDirectory && entry.name === '.git') continue;

    // Skip if path is outside basePath. A plain string-prefix test would also accept
    // siblings such as "/repo/src-old" for base "/repo/src", so compare path segments.
    if (basePath) {
      const fromBase = path.relative(basePath, fullPath);
      const isOutside =
        fromBase === '..' || fromBase.startsWith(`..${path.sep}`) || path.isAbsolute(fromBase);
      if (isOutside) continue;
    }

    // The `ignore` package expects directory pathnames to end with a slash; without it,
    // directory-only patterns (e.g. "node_modules/") do not match the directory itself
    // and the whole subtree gets walked.
    const ignoreTarget = isDirectory ? `${relativePath}/` : relativePath;
    if (ig.ignores(ignoreTarget)) continue;

    if (isDirectory) {
      yield* walkDirectory(fullPath, ig, basePath, relativePath);
    } else if (entry.isFile() && (await isTextFile(fullPath))) {
      yield fullPath;
    }
  }
}
/**
 * Computes a short hash representing the tracked files in a Git repository.
 *
 * Lists tracked files with `git ls-files`, hashes each one with `git hash-object`,
 * and folds `path:hash` pairs (plus the chunk size, when finite) into a SHA-256 digest.
 * @param chunkSize The maximum chunk size in MB.
 * @returns The first 8 hex characters of the digest, or a base-36 timestamp if the
 *   file listing fails (for example when not inside a Git repository).
 */
async function getRepoChecksum(chunkSize: number): Promise<string> {
  try {
    // `-z` emits NUL-terminated, unquoted names, so paths with spaces, quotes or
    // non-ASCII characters arrive verbatim. The larger maxBuffer keeps big repositories
    // from overflowing Node's default output cap and landing in the fallback below.
    const trackedFiles = execSync('git ls-files -c -z --exclude-standard', {
      maxBuffer: 64 * 1024 * 1024,
    })
      .toString()
      .split('\0')
      .filter((file) => file.length > 0)
      .sort();

    const hash = crypto.createHash('sha256');
    for (const file of trackedFiles) {
      try {
        // execFileSync passes the name as a single argument without a shell, so no
        // character in a file name can break or inject into the command line.
        // `--` keeps names that start with "-" from being parsed as options.
        const fileHash = execFileSync('git', ['hash-object', '--', file]).toString().trim();
        hash.update(`${file}:${fileHash}\n`);
      } catch {
        // Skip files that can't be hashed
        continue;
      }
    }

    // Include chunkSize to differentiate different chunk settings
    if (chunkSize !== Infinity) hash.update(chunkSize.toString());
    return hash.digest('hex').slice(0, 8);
  } catch (error) {
    assertError(error);
    // If not a Git repo, return timestamp as fallback
    logger.warn('Not a git repository, using timestamp as fallback:', error.message);
    return Date.now().toString(36);
  }
}
/**
 * Persists one chunk: every file becomes a `>>>> <path>` header line followed by its
 * content, sections are separated by a blank line, and the result is saved as
 * `chunk-<index>.txt` inside the output directory.
 * @param files The file entries that make up this chunk.
 * @param index Chunk number used in the output file name.
 * @param outputDir Directory that receives the chunk file.
 */
async function writeChunk(files: FileEntry[], index: number, outputDir: string): Promise<void> {
  const sections: string[] = [];
  for (const { path: filePath, content } of files) {
    sections.push(`>>>> ${filePath}\n${content}`);
  }

  const target = path.join(outputDir, `chunk-${index}.txt`);
  await fs.writeFile(target, sections.join('\n\n'), 'utf-8');
  logger.info(`Written chunk ${index} with ${files.length} files`);
}
/** Settings accepted by the repository serializer. */
interface SerializeOptions {
  /** Upper bound for a chunk, in megabytes; `Infinity` produces a single chunk. */
  chunkSizeMB: number;
  /** Directory to serialize; the current working directory is used when omitted. */
  basePath?: string;
}
/**
 * Serializes text-based files in a repository or subdirectory into chunks.
 *
 * Output goes to `repo-serialized/<checksum>[_<basename>][_<size>mb]` under the current
 * working directory. Files are appended to the current chunk until adding the next one
 * would exceed the size limit; a single file larger than the limit gets a chunk of its own.
 * Files that fail to read are logged and skipped.
 * @param options The serialization options including chunk size and optional base path.
 * @returns The output directory path where all chunk files are stored.
 */
async function serializeRepo(options: SerializeOptions): Promise<string> {
  const { chunkSizeMB, basePath } = options;

  const checksum = await getRepoChecksum(chunkSizeMB);
  const pathSuffix = basePath ? `_${path.basename(basePath)}` : '';
  const dirName =
    chunkSizeMB === Infinity
      ? `${checksum}${pathSuffix}`
      : `${checksum}${pathSuffix}_${chunkSizeMB}mb`;
  const outputRoot = path.join(process.cwd(), 'repo-serialized');
  const outputDir = path.join(outputRoot, dirName);
  await fs.mkdir(outputDir, { recursive: true });

  const ig = await readGitignore();
  const maxChunkBytes = chunkSizeMB * 1024 * 1024;
  const files: FileEntry[] = [];
  let currentChunkSize = 0;
  let chunkIndex = 0;

  const startPath = basePath ? path.resolve(process.cwd(), basePath) : process.cwd();
  for await (const filePath of walkDirectory(startPath, ig, startPath)) {
    // Chunks from earlier runs live under the walked tree; serializing them again
    // would nest old output inside new output whenever the folder is not gitignored.
    const fromOutputRoot = path.relative(outputRoot, filePath);
    const isOwnOutput =
      fromOutputRoot !== '' &&
      fromOutputRoot !== '..' &&
      !fromOutputRoot.startsWith(`..${path.sep}`) &&
      !path.isAbsolute(fromOutputRoot);
    if (isOwnOutput) continue;

    try {
      const content = await fs.readFile(filePath, 'utf-8');
      const fileSize = Buffer.byteLength(content, 'utf-8');

      // If next file exceeds chunk size, write the current chunk first. The length
      // guard prevents an empty chunk file when one file alone is over the limit.
      if (files.length > 0 && currentChunkSize + fileSize > maxChunkBytes) {
        await writeChunk(files, chunkIndex++, outputDir);
        files.length = 0;
        currentChunkSize = 0;
      }

      files.push({
        path: path.relative(process.cwd(), filePath),
        content,
      });
      currentChunkSize += fileSize;
    } catch (error) {
      assertError(error);
      logger.error(`Error processing file ${filePath}:`, error.message);
    }
  }

  // Write any leftover files as the final chunk
  if (files.length > 0) {
    await writeChunk(files, chunkIndex, outputDir);
  }
  return outputDir;
}
// Command-line interface: declares the --size and --path options, usage examples,
// and an async validation step that rejects bad sizes and inaccessible paths.
const argv = yargs(hideBin(process.argv))
  .option('size', {
    alias: 's',
    type: 'number',
    description: 'Chunk size in megabytes',
    default: Infinity,
  })
  .option('path', {
    alias: 'p',
    type: 'string',
    description: 'Base path to serialize (optional)',
  })
  .example('pnpm serialize-repo', 'Serialize entire repository into a single file')
  .example('pnpm serialize-repo -s 10', 'Split repository into 10MB chunks')
  .example('pnpm serialize-repo -p src/app', 'Serialize only the src/app directory')
  .example('pnpm serialize-repo -s 5 -p src/components', 'Split src/components into 5MB chunks')
  .check(async (parsed) => {
    // The size must be a real, strictly positive number (Infinity is allowed).
    const sizeIsValid = !isNaN(parsed.size) && parsed.size > 0;
    if (!sizeIsValid) {
      throw new Error('Please provide a valid chunk size in megabytes');
    }

    // When a base path is given it must be accessible.
    if (parsed.path) {
      try {
        await fs.access(parsed.path);
      } catch {
        throw new Error('Provided path does not exist');
      }
    }
    return true;
  })
  .help().argv;
/**
 * Main entry point. Waits for the parsed command-line options, serializes the
 * repository, then logs where the output was written: every file in the output
 * directory when a chunk size was given, otherwise the single `chunk-0.txt`.
 */
async function main(): Promise<void> {
  const { size, path: basePath } = await argv;
  const sizeNote = size !== Infinity ? ` with chunk size ${size}MB` : '';
  logger.info(`Serializing repo from ${basePath || 'root'} ${sizeNote}`);

  const outputDir = await serializeRepo({ chunkSizeMB: size, basePath });
  logger.info(`✨ Repository serialized successfully!`);

  if (size !== Infinity) {
    const files = await fs.readdir(outputDir);
    logger.info(`Generated chunks:`);
    for (const file of files) {
      logger.info(path.join(outputDir, file));
    }
  } else {
    logger.info(`Outputed file:`);
    logger.info(path.join(outputDir, 'chunk-0.txt'));
  }
}

// Execute main. Failures are reported and reflected in the exit code instead of
// surfacing as an unhandled promise rejection.
main().catch((error: unknown) => {
  logger.error(
    'Failed to serialize repository:',
    error instanceof Error ? error.message : error,
  );
  process.exitCode = 1;
});