Last active
September 2, 2024 05:12
-
-
Save daweimau/6cb7f3a81742e7b03524133651b1ecc6 to your computer and use it in GitHub Desktop.
Run this to install the `split_csv` command. Usage: `split_csv your_file_name.csv 3000`, where `3000` is the optional batch size (defaults to 2000).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Installer: writes ~/.scripts/csv_splitter.sh, makes it executable and
# registers a `split_csv` alias in ~/.zshrc. Safe to run more than once.
mkdir -p ~/.scripts && cat << 'EOF' > ~/.scripts/csv_splitter.sh
#!/bin/bash
#
# csv_splitter.sh - split a CSV file into numbered parts.
#
# Usage: csv_splitter.sh filename.csv [maxrows]
#
#   filename.csv  CSV file whose first line is the header row.
#   maxrows       Maximum data rows per part (positive integer, default 2000).
#
# Writes <filename>_part_<N>.csv next to the input file. Every part starts
# with the header row. Every part except the last has its final line
# terminator removed; the last part ends exactly as the input file ends.
#
# Exit status: 0 on success, 1 on a usage error or an I/O failure.

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Check if a filename was provided
if [[ -z "${1:-}" ]]; then
  echo "Usage: $0 filename.csv [maxrows]"
  exit 1
fi

# Input file from argument
input_file="$1"

# Max rows per batch (default to 2000 if not provided)
batch_size="${2:-2000}"

# Only plain digits are accepted: a zero, negative or non-numeric batch size
# would otherwise make the loop below spin forever or break the arithmetic.
case "$batch_size" in
  '' | *[!0-9]*) die "maxrows must be a positive integer, got: $batch_size" ;;
esac
# Force base 10 so a value such as "08" is not parsed as (invalid) octal.
batch_size=$((10#$batch_size))
if ((batch_size < 1)); then
  die "maxrows must be a positive integer, got: $batch_size"
fi

if [[ ! -f "$input_file" || ! -r "$input_file" ]]; then
  die "Cannot read input file: $input_file"
fi

# Total number of lines; `grep -c ''` also counts a final line that has no
# trailing newline. Input is redirected so odd file names are never parsed
# as options.
total_lines=$(grep -c '' < "$input_file") || true
if [[ -z "$total_lines" ]] || ((total_lines < 1)); then
  die "Input file is empty (no header row): $input_file"
fi

# Number of data rows (everything except the header)
data_rows=$((total_lines - 1))
echo "Original $input_file has $data_rows data rows"

# Extract the header, stopping after the first line instead of reading it all
header=$(sed -n '1{p;q;}' < "$input_file")

start=2 # first data line of the current part (line 1 is the header)
part=1  # part counter, used for naming output files

while ((start <= total_lines)); do
  # Last line of this part, clamped to the end of the file
  end=$((start + batch_size - 1))
  if ((end > total_lines)); then
    end=$total_lines
  fi
  rows_in_part=$((end - start + 1))

  output_file="${input_file%.csv}_part_${part}.csv"

  # Header followed by this part's rows; sed quits at the last wanted line
  # so the remainder of the input is not scanned for every part.
  {
    printf '%s\n' "$header"
    sed -n "${start},${end}p;${end}q" < "$input_file"
  } > "$output_file" || die "Failed to write $output_file"

  # For every part except the last, drop the final line terminator. Inspect
  # the last two bytes so exactly "\r\n" (2 bytes) or "\n" (1 byte) is
  # removed and no data character is ever cut off.
  if ((end < total_lines)); then
    case "$(tail -c 2 < "$output_file" | od -An -tx1 | tr -d ' \n')" in
      0d0a) strip=2 ;;
      *0a) strip=1 ;;
      *) strip=0 ;;
    esac
    if ((strip > 0)); then
      truncate -s "-${strip}" -- "$output_file" ||
        die "Failed to trim $output_file"
    fi
  fi

  echo "Created $output_file with $rows_in_part data rows"

  # Advance to the next part
  start=$((end + 1))
  part=$((part + 1))
done

echo "Split complete"
EOF
chmod +x ~/.scripts/csv_splitter.sh
# Append the alias only if it is not already present, so re-running the
# installer does not pile up duplicate lines. grep's stderr is silenced on
# purpose: ~/.zshrc may not exist yet.
grep -qxF "alias split_csv='~/.scripts/csv_splitter.sh'" ~/.zshrc 2>/dev/null ||
  echo "alias split_csv='~/.scripts/csv_splitter.sh'" >> ~/.zshrc
source ~/.zshrc
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment