Skip to content

Instantly share code, notes, and snippets.

@ckandoth
Last active October 16, 2025 03:06
Show Gist options
  • Save ckandoth/4006866209475ae558ead88a53e6b59f to your computer and use it in GitHub Desktop.
Save ckandoth/4006866209475ae558ead88a53e6b59f to your computer and use it in GitHub Desktop.
Use an Azure NP10 Dragen PAYG server to process FASTQs in blob storage
#!/bin/bash
# Shell options: -u aborts on unset variables, pipefail makes a pipeline fail if
# any stage fails. -e is not enabled; failures are reported through explicit
# "|| error" checks on the commands that matter.
set -uo pipefail

# Clean up sentinel files from prior run if any
rm -f /tmp/.failure /tmp/.success

# The orchestrator learns the outcome from /tmp/.success or /tmp/.failure. If the
# shell exits without the success sentinel having been written (for example when
# -u aborts on an unset variable), leave a failure sentinel behind so the
# orchestrator is not left waiting for a file that never appears.
mark_failure_unless_success() {
  [[ -e /tmp/.success ]] || touch /tmp/.failure
}
trap mark_failure_unless_success EXIT
# Report a fatal problem and stop the script.
# Arguments: $1 - message printed after the "Error: " prefix
# Outputs:   writes the message to stderr and creates /tmp/.failure
# Returns:   does not return; exits the shell with status 1
error() {
  printf '%s\n' "Error: $1" 1>&2
  # Sentinel file that tells the orchestrator this run failed
  touch /tmp/.failure
  exit 1
}
# Exactly three positional arguments are required. Anything else prints the usage
# text to stderr, leaves the failure sentinel behind, and stops with status 1.
if [[ $# -ne 3 ]]; then
  cat 1>&2 << 'EOM_USAGE'
Usage: ./process_fastqs.sh [FASTQ_BLOB_DIR] [REF_BLOB_DIR] [OUTPUT_BLOB_DIR]
Purpose: Process a single sample whose FASTQs are stored in a blob storage folder, and upload results back to blob storage
Command-line arguments:
FASTQ_BLOB_DIR - e.g. "fqs/ajtrio/son" where "fqs" is the container and "ajtrio/son" is the subfolder containing FASTQs to process
REF_BLOB_DIR - e.g. "ref/hg38" where "ref" is the container and "hg38" is the subfolder containing Dragen reference data as a tar file
OUTPUT_BLOB_DIR - e.g. "dgn/ajtrio/son" where "dgn" is the container, "son" is the sample name, and "ajtrio/son" is the subfolder for outputs
Environment variables:
AZURE_STORAGE_ACCOUNT - the name of the ADLS Gen2 storage account we will use
AZURE_STORAGE_KEY - one of the keys returned by "az storage account keys list"
EOM_USAGE
  touch /tmp/.failure
  exit 1
fi
# Both storage credentials must be present and non-empty before any work starts
[[ -n "${AZURE_STORAGE_ACCOUNT:-}" && -n "${AZURE_STORAGE_KEY:-}" ]] \
  || error "AZURE_STORAGE_ACCOUNT and/or AZURE_STORAGE_KEY environment variables are not set"

# Capture the three positional arguments: FASTQ folder, reference folder, output folder
FASTQ_BLOB_DIR="$1"
REF_BLOB_DIR="$2"
OUTPUT_BLOB_DIR="$3"
printf '%s\n' "Processing ${FASTQ_BLOB_DIR} on VM $(hostname)"

# Build the blob storage URLs: the account endpoint followed by "container/subfolder"
STORAGE_ACCT_ENDPOINT="https://${AZURE_STORAGE_ACCOUNT}.blob.core.windows.net"
FASTQ_BLOB_URL="${STORAGE_ACCT_ENDPOINT}/${FASTQ_BLOB_DIR}"
REF_BLOB_URL="${STORAGE_ACCT_ENDPOINT}/${REF_BLOB_DIR}"
OUTPUT_BLOB_URL="${STORAGE_ACCT_ENDPOINT}/${OUTPUT_BLOB_DIR}"
# Format the local disk with xfs for better handling of large files and to clean up data from a prior run of this script
# Every step is checked: carrying on after a failed lookup/unmount/format/mount
# would either keep stale data around or write to whatever filesystem backs /mnt.
PARTITION=$(findmnt -no SOURCE /mnt) || error "Failed to find the device mounted at /mnt"
[[ -n "${PARTITION}" ]] || error "No device is mounted at /mnt"
sudo umount /mnt || error "Failed to unmount /mnt"
sudo mkfs.xfs -qf "${PARTITION}" || error "Failed to format ${PARTITION} with xfs"
sudo mount "${PARTITION}" /mnt || error "Failed to mount ${PARTITION} at /mnt"
# GROUPS is a bash array; its first element is used as the owning group
sudo chown -R "${USER}:${GROUPS[0]}" /mnt || error "Failed to change ownership of /mnt"
# Create some directories we will need if they don't already exist
mkdir -p /mnt/{fqs,dgn} || error "Failed to create directories in /mnt"
mkdir -p /tmp/{ref,dgn} || error "Failed to create directories in /tmp"
# Download reference data into /tmp if it wasn't already downloaded by a previous run of this script
REF_DIR="/tmp/${REF_BLOB_DIR}"
if [[ ! -d "${REF_DIR}" ]]; then
  echo "Downloading reference data into ${REF_DIR}..."
  # The container name is everything before the first slash
  REF_FS="${REF_BLOB_DIR%%/*}"
  SAS_EXPIRY=$(date -u -d "10 mins" '+%Y-%m-%dT%H:%MZ')
  REF_SAS=$(az storage container generate-sas --name "${REF_FS}" --permissions lr --expiry "${SAS_EXPIRY}" --https-only -o tsv) \
    || error "Failed to generate a SAS token for container ${REF_FS}"
  [[ -n "${REF_SAS}" ]] || error "Received an empty SAS token for container ${REF_FS}"
  # Download into a staging directory and rename it only after azcopy succeeds.
  # The existence check above is what decides whether reference data gets reused,
  # so a failed or interrupted download must never leave ${REF_DIR} behind.
  REF_STAGING_DIR="${REF_DIR}.partial"
  rm -rf -- "${REF_STAGING_DIR:?}" || error "Failed to remove stale ${REF_STAGING_DIR}"
  mkdir -p -- "$(dirname -- "${REF_DIR}")" || error "Failed to create parent directory of ${REF_DIR}"
  azcopy cp "${REF_BLOB_URL}/*?${REF_SAS}" "${REF_STAGING_DIR}" --recursive --output-level=quiet || error "Failed to download reference data"
  mv -- "${REF_STAGING_DIR}" "${REF_DIR}" || error "Failed to move reference data into ${REF_DIR}"
else
  echo "Reusing existing reference data under ${REF_DIR}..."
fi
# Download FASTQs and parse headers to make a FASTQ list for use with Dragen
FQS_DIR="/mnt/${FASTQ_BLOB_DIR}"
# The last path component of the output folder doubles as the sample name
SAMPLE=$(basename -- "${OUTPUT_BLOB_DIR}")
echo "Downloading FASTQs and creating a FASTQ list for Dragen under ${FQS_DIR}..."
# The container name is everything before the first slash
FASTQ_FS="${FASTQ_BLOB_DIR%%/*}"
SAS_EXPIRY=$(date -u -d "20 mins" '+%Y-%m-%dT%H:%MZ')
FASTQ_SAS=$(az storage container generate-sas --name "${FASTQ_FS}" --permissions lr --expiry "${SAS_EXPIRY}" --https-only -o tsv) \
  || error "Failed to generate a SAS token for container ${FASTQ_FS}"
[[ -n "${FASTQ_SAS}" ]] || error "Received an empty SAS token for container ${FASTQ_FS}"
azcopy cp "${FASTQ_BLOB_URL}/*?${FASTQ_SAS}" "${FQS_DIR}" --output-level=quiet || error "Failed to download FASTQs"
FASTQ_LIST="${FQS_DIR}/fastq_list.csv"
echo "RGID,RGSM,RGLB,Lane,Read1File,Read2File" > "${FASTQ_LIST}" || error "Failed to create ${FASTQ_LIST}"
for fq1 in "${FQS_DIR}"/*_R1*.fastq.gz; do
  # An unmatched glob stays as the literal pattern, so a non-file here means
  # that no R1 FASTQ was downloaded at all
  [[ -f "${fq1}" ]] || error "No R1 FASTQs found under ${FQS_DIR}"
  # Fields 3 and 4 of the colon-separated first header line are taken as the
  # flowcell and lane (assumes Illumina-style read names)
  HEADER=$(gzip -dc "${fq1}" | head -n1)
  FLOWCELL=$(cut -f3 -d: <<< "${HEADER}")
  LANE=$(cut -f4 -d: <<< "${HEADER}")
  RGID="${FLOWCELL}.${LANE}.${SAMPLE}"
  # Derive the mate's path by swapping the last "_R1" of the file name only, so
  # an "_R1" inside a directory or sample name is never rewritten by mistake
  fq1_name="${fq1##*/}"
  fq2="${fq1%/*}/${fq1_name%_R1*}_R2${fq1_name##*_R1}"
  if [[ -f "${fq2}" ]]; then
    echo "$RGID,$SAMPLE,UnknownLibrary,$LANE,$fq1,$fq2" >> "${FASTQ_LIST}"
  else
    error "Could not find R2 FASTQ for $fq1"
  fi
done
# Locate the population SNP VCF for use by the ASCN caller to measure B-allele counts
mapfile -t ASCN_SNP_VCFS < <(find "${REF_DIR}" -name "*.ascn.snps.vcf.gz")
# More than one match cannot be passed as a single path, so stop instead of
# handing Dragen a newline-joined string
(( ${#ASCN_SNP_VCFS[@]} <= 1 )) \
  || error "Found ${#ASCN_SNP_VCFS[@]} files matching *.ascn.snps.vcf.gz under ${REF_DIR}; expected one"
# With no match the option still receives an empty value, as before; only a warning is added
ASCN_SNP_VCF="${ASCN_SNP_VCFS[0]:-}"
[[ -n "${ASCN_SNP_VCF}" ]] || echo "Warning: no *.ascn.snps.vcf.gz found under ${REF_DIR}" >&2
echo "Running Dragen on sample ${SAMPLE} using ${FASTQ_LIST}..."
OUTPUT_DIR="/mnt/${OUTPUT_BLOB_DIR}"
mkdir -p -- "${OUTPUT_DIR}" || error "Failed to create ${OUTPUT_DIR}"
# Dragen options, grouped by the component they configure
DRAGEN_ARGS=(
  # Licensing, scratch space, and reference
  --lic-credentials ~/dragen_lic.cfg
  --intermediate-results-dir /tmp/dgn
  --ref-dir "${REF_DIR}"
  # Mapping, alignment, and sorted/duplicate-marked CRAM output
  --enable-map-align true
  --enable-map-align-output true
  --output-format CRAM
  --enable-duplicate-marking true
  --generate-sa-tags true
  --enable-sort true
  --qc-coverage-ignore-overlaps true
  # Small variant calling
  --enable-variant-caller true
  --vc-emit-ref-confidence GVCF
  --vc-compact-gvcf true
  --vc-enable-vcf-output true
  --vc-combine-phased-variants-distance 6
  # Targeted callers
  --enable-targeted true
  --targeted-merge-vc true
  # Structural variants
  --enable-sv true
  # Copy number variants
  --enable-cnv true
  --cnv-population-b-allele-vcf "${ASCN_SNP_VCF}"
  --cnv-enable-cyto-output true
  --cnv-enable-mosaic-calling true
  --cnv-interval-width 1000
  --cnv-enable-self-normalization true
  --cnv-enable-gcbias-correction true
  --cnv-counts-method start
  --cnv-enable-segdups-extension true
  --cnv-enable-tracks false
  # Additional callers
  --enable-hla true
  --enable-star-allele true
  --enable-pgx true
  --repeat-genotype-enable true
  --enable-mrjd true
  --mrjd-enable-high-sensitivity-mode true
  # Inputs and outputs
  --fastq-list "${FASTQ_LIST}"
  --fastq-list-sample-id "${SAMPLE}"
  --output-directory "${OUTPUT_DIR}"
  --output-file-prefix "${SAMPLE}"
)
dragen "${DRAGEN_ARGS[@]}" || error "Dragen run failed."
echo "Dragen run successful. Uploading outputs to ${OUTPUT_BLOB_URL}..."
# The container name is everything before the first slash
OUT_FS="${OUTPUT_BLOB_DIR%%/*}"
SAS_EXPIRY=$(date -u -d "20 mins" '+%Y-%m-%dT%H:%MZ')
# Check token generation on its own so a credential problem is not reported as an upload failure
OUTPUT_SAS=$(az storage container generate-sas --name "${OUT_FS}" --permissions cw --expiry "${SAS_EXPIRY}" --https-only -o tsv) \
  || error "Failed to generate a SAS token for container ${OUT_FS}"
[[ -n "${OUTPUT_SAS}" ]] || error "Received an empty SAS token for container ${OUT_FS}"
# NOTE(review): no --recursive here, so subdirectories of the output folder are
# presumably not uploaded - confirm that is intended
azcopy cp "${OUTPUT_DIR}/*" "${OUTPUT_BLOB_URL}?${OUTPUT_SAS}" --output-level=quiet || error "Failed to upload outputs"
# Cleanup and signal completion to the orchestrator
rm -rf /tmp/dgn || error "Failed to delete /tmp/dgn"
touch /tmp/.success
echo "Finished with sample ${SAMPLE}."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment