Last active
October 16, 2025 03:06
-
-
Save ckandoth/4006866209475ae558ead88a53e6b59f to your computer and use it in GitHub Desktop.
Use an Azure NP10 Dragen PAYG server to process FASTQs in blob storage
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
set -uo pipefail

# Remove sentinel files left behind by an earlier run, so the orchestrator
# only ever sees markers produced by this invocation
for sentinel in /tmp/.failure /tmp/.success; do
  rm -f "$sentinel"
done
# Report a fatal problem: print the message to stderr, drop the failure
# sentinel for the orchestrator to find, and terminate the script.
error() {
  printf 'Error: %s\n' "$1" >&2
  touch /tmp/.failure  # Signals failure to the orchestrator
  exit 1
}
# Exactly three command-line arguments are required; otherwise print usage
# to stderr, signal failure, and bail out
if [[ $# -ne 3 ]]; then
  cat >&2 << EOM_USAGE
Usage: ./process_fastqs.sh [FASTQ_BLOB_DIR] [REF_BLOB_DIR] [OUTPUT_BLOB_DIR]
Purpose: Process a single sample whose FASTQs are stored in a blob storage folder, and upload results back to blob storage
Command-line arguments:
FASTQ_BLOB_DIR - e.g. "fqs/ajtrio/son" where "fqs" is the container and "ajtrio/son" is the subfolder containing FASTQs to process
REF_BLOB_DIR - e.g. "ref/hg38" where "ref" is the container and "hg38" is the subfolder containing Dragen reference data as a tar file
OUTPUT_BLOB_DIR - e.g. "dgn/ajtrio/son" where "dgn" is the container, "son" is the sample name, and "ajtrio/son" is the subfolder for outputs
Environment variables:
AZURE_STORAGE_ACCOUNT - the name of the ADLS Gen2 storage account we will use
AZURE_STORAGE_KEY - one of the keys returned by "az storage account keys list"
EOM_USAGE
  touch /tmp/.failure
  exit 1
fi

# Both storage credentials must be present in the environment before we
# attempt any az/azcopy calls below
if ! [[ -n "${AZURE_STORAGE_ACCOUNT:-}" && -n "${AZURE_STORAGE_KEY:-}" ]]; then
  error "AZURE_STORAGE_ACCOUNT and/or AZURE_STORAGE_KEY environment variables are not set"
fi
# Name the three positional arguments (validated above)
FASTQ_BLOB_DIR="$1"
REF_BLOB_DIR="$2"
OUTPUT_BLOB_DIR="$3"
echo "Processing ${FASTQ_BLOB_DIR} on VM $(hostname)"

# Assemble the full blob storage URLs needed by the azcopy transfers below
printf -v STORAGE_ACCT_ENDPOINT 'https://%s.blob.core.windows.net' "${AZURE_STORAGE_ACCOUNT}"
printf -v FASTQ_BLOB_URL '%s/%s' "${STORAGE_ACCT_ENDPOINT}" "${FASTQ_BLOB_DIR}"
printf -v REF_BLOB_URL '%s/%s' "${STORAGE_ACCT_ENDPOINT}" "${REF_BLOB_DIR}"
printf -v OUTPUT_BLOB_URL '%s/%s' "${STORAGE_ACCT_ENDPOINT}" "${OUTPUT_BLOB_DIR}"
# Reformat the local scratch disk with XFS: better handling of large files,
# and it wipes any data left over from a prior run of this script.
# Every step is checked — a failed umount followed by a forced mkfs on a
# still-mounted partition would be destructive.
PARTITION=$(findmnt -no SOURCE /mnt) || error "Failed to find the device mounted at /mnt"
sudo umount /mnt || error "Failed to unmount /mnt"
sudo mkfs.xfs -qf "${PARTITION}" || error "Failed to format ${PARTITION} with xfs"
sudo mount "${PARTITION}" /mnt || error "Failed to mount ${PARTITION} at /mnt"
sudo chown -R "${USER}:${GROUPS}" /mnt || error "Failed to change ownership of /mnt"
# Create the container-level directories we will need, derived from the
# arguments instead of hard-coding the example container names (fqs/dgn/ref)
mkdir -p "/mnt/${FASTQ_BLOB_DIR%%/*}" "/mnt/${OUTPUT_BLOB_DIR%%/*}" || error "Failed to create directories in /mnt"
mkdir -p "/tmp/${REF_BLOB_DIR%%/*}" /tmp/dgn || error "Failed to create directories in /tmp"
# Download reference data into /tmp, unless a previous run of this script
# already fetched it (the reference tree is reusable across samples)
REF_DIR="/tmp/${REF_BLOB_DIR}"
if [[ ! -d "${REF_DIR}" ]]; then
  echo "Downloading reference data into ${REF_DIR}..."
  REF_FS=${REF_BLOB_DIR%%/*}  # container name is the first path component
  SAS_EXPIRY=$(date -u -d "10 mins" '+%Y-%m-%dT%H:%MZ') || error "Failed to compute SAS expiry time"
  # Check SAS generation explicitly: an empty SAS would otherwise surface as
  # a confusing azcopy authentication failure
  REF_SAS=$(az storage container generate-sas --name "${REF_FS}" --permissions lr --expiry "${SAS_EXPIRY}" --https-only -o tsv) || error "Failed to generate a read SAS for container ${REF_FS}"
  azcopy cp "${REF_BLOB_URL}/*?${REF_SAS}" "${REF_DIR}" --recursive --output-level=quiet || error "Failed to download reference data"
else
  echo "Reusing existing reference data under ${REF_DIR}..."
fi
# Download FASTQs and parse their headers to build the fastq_list.csv that
# Dragen consumes (one row per R1/R2 pair)
FQS_DIR="/mnt/${FASTQ_BLOB_DIR}"
SAMPLE=$(basename "${OUTPUT_BLOB_DIR}")
echo "Downloading FASTQs and creating a FASTQ list for Dragen under ${FQS_DIR}..."
FASTQ_FS=${FASTQ_BLOB_DIR%%/*}  # container name is the first path component
SAS_EXPIRY=$(date -u -d "20 mins" '+%Y-%m-%dT%H:%MZ') || error "Failed to compute SAS expiry time"
FASTQ_SAS=$(az storage container generate-sas --name "${FASTQ_FS}" --permissions lr --expiry "${SAS_EXPIRY}" --https-only -o tsv) || error "Failed to generate a read SAS for container ${FASTQ_FS}"
azcopy cp "${FASTQ_BLOB_URL}/*?${FASTQ_SAS}" "${FQS_DIR}" --output-level=quiet || error "Failed to download FASTQs"
FASTQ_LIST="${FQS_DIR}/fastq_list.csv"
echo "RGID,RGSM,RGLB,Lane,Read1File,Read2File" > "${FASTQ_LIST}"
FOUND_R1=0
for fq1 in "${FQS_DIR}"/*_R1*.fastq.gz; do
  [[ -e "$fq1" ]] || break  # unmatched glob leaves the literal pattern; skip it
  FOUND_R1=1
  # First header line is @instrument:run:flowcell:lane:... (Illumina convention);
  # no "|| error" on this pipeline — head(1) exiting early can SIGPIPE gzip,
  # which pipefail would report as a spurious failure. Validate fields instead.
  HEADER=$(gzip -dc "$fq1" | head -n1)
  FLOWCELL=$(cut -f3 -d: <<< "$HEADER")
  LANE=$(cut -f4 -d: <<< "$HEADER")
  [[ -n "$FLOWCELL" && -n "$LANE" ]] || error "Could not parse flowcell/lane from header of $fq1"
  RGID="${FLOWCELL}.${LANE}.${SAMPLE}"
  fq2=${fq1/_R1/_R2}  # mate file: same name with the first _R1 swapped for _R2
  if [[ -f "${fq2}" ]]; then
    echo "$RGID,$SAMPLE,UnknownLibrary,$LANE,$fq1,$fq2" >> "${FASTQ_LIST}"
  else
    error "Could not find R2 FASTQ for $fq1"
  fi
done
(( FOUND_R1 )) || error "No R1 FASTQs found under ${FQS_DIR}"
# Locate the population SNP VCF used by the ASCN caller to measure B-allele
# counts; take the first match and fail loudly if none exist, instead of
# passing an empty (or multi-line) path to Dragen
ASCN_SNP_VCF=$(find "${REF_DIR}" -name "*.ascn.snps.vcf.gz" | head -n1)
[[ -n "${ASCN_SNP_VCF}" ]] || error "Could not find a *.ascn.snps.vcf.gz under ${REF_DIR}"
echo "Running Dragen on sample ${SAMPLE} using ${FASTQ_LIST}..."
OUTPUT_DIR="/mnt/${OUTPUT_BLOB_DIR}"
mkdir -p "${OUTPUT_DIR}" || error "Failed to create ${OUTPUT_DIR}"
# Dragen arguments as an array: each flag stays readable and is passed intact
DRAGEN_ARGS=(
  --lic-credentials "${HOME}/dragen_lic.cfg"
  --intermediate-results-dir /tmp/dgn
  --ref-dir "${REF_DIR}"
  # Alignment and CRAM output with duplicate marking
  --enable-map-align true --enable-map-align-output true --output-format CRAM
  --enable-duplicate-marking true --generate-sa-tags true --enable-sort true
  --qc-coverage-ignore-overlaps true
  # Small-variant calling (gVCF + VCF)
  --enable-variant-caller true --vc-emit-ref-confidence GVCF --vc-compact-gvcf true
  --vc-enable-vcf-output true --vc-combine-phased-variants-distance 6
  --enable-targeted true --targeted-merge-vc true
  # Structural-variant and copy-number calling
  --enable-sv true --enable-cnv true
  --cnv-population-b-allele-vcf "${ASCN_SNP_VCF}"
  --cnv-enable-cyto-output true --cnv-enable-mosaic-calling true
  --cnv-interval-width 1000 --cnv-enable-self-normalization true
  --cnv-enable-gcbias-correction true --cnv-counts-method start
  --cnv-enable-segdups-extension true --cnv-enable-tracks false
  # Specialized callers
  --enable-hla true --enable-star-allele true --enable-pgx true
  --repeat-genotype-enable true --enable-mrjd true
  --mrjd-enable-high-sensitivity-mode true
  # Inputs and outputs
  --fastq-list "${FASTQ_LIST}" --fastq-list-sample-id "${SAMPLE}"
  --output-directory "${OUTPUT_DIR}" --output-file-prefix "${SAMPLE}"
)
dragen "${DRAGEN_ARGS[@]}" || error "Dragen run failed."
echo "Dragen run successful. Uploading outputs to ${OUTPUT_BLOB_URL}..."
OUT_FS=${OUTPUT_BLOB_DIR%%/*}  # container name is the first path component
SAS_EXPIRY=$(date -u -d "20 mins" '+%Y-%m-%dT%H:%MZ') || error "Failed to compute SAS expiry time"
# Check SAS generation explicitly so an az failure doesn't surface as a
# confusing azcopy authentication error
OUTPUT_SAS=$(az storage container generate-sas --name "${OUT_FS}" --permissions cw --expiry "${SAS_EXPIRY}" --https-only -o tsv) || error "Failed to generate a write SAS for container ${OUT_FS}"
azcopy cp "${OUTPUT_DIR}/*" "${OUTPUT_BLOB_URL}?${OUTPUT_SAS}" --output-level=quiet || error "Failed to upload outputs"
# Cleanup intermediates and signal completion to the orchestrator
rm -rf /tmp/dgn || error "Failed to delete /tmp/dgn"
touch /tmp/.success
echo "Finished with sample ${SAMPLE}."
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment