Created
September 24, 2021 23:53
-
-
Save ckandoth/79d831d08252599d7af6a019c410b572 to your computer and use it in GitHub Desktop.
Download and prepare GRCh38 reference data useful in NGS analyses
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download and prepare GRCh38 reference data useful in NGS analyses.
# Needs bash (brace expansion, pipefail) with conda and mamba on PATH, plus
# network access. All outputs are written to the current directory.

# Stop at the first failing command or failing pipeline stage, so later steps
# never run on a missing or partial download.
# NOTE(review): `set -u` is left off on purpose; some conda activation hooks are
# reported to read unset variables -- confirm before enabling it.
set -eo pipefail

# Threads handed to bcftools; override with e.g. `THREADS=16 bash <script>`.
threads=${THREADS:-8}

ref_bucket=gs://genomics-public-data/references/GRCh38_Verily
ref_fasta=GRCh38_Verily_v1.genome.fa
gatk_bucket=gs://gatk-best-practices/somatic-hg38
dbsnp_raw=GCF_000001405.39.gz
acc_map=GCF_000001405.39.acc_ids.txt
dbsnp_all=dbsnp_b155_grch38_all.vcf.gz
dbsnp_snps=dbsnp_b155_grch38_snps.vcf.gz
assembly_report_url=https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_report.txt

# Prepare a conda environment with tools we will need:
mamba create -y -n ref
# `conda activate` is a shell function; load it explicitly so activation also
# works when this runs as a non-interactive script.
eval "$(conda shell.bash hook)"
conda activate ref
# conda-forge is listed ahead of bioconda, as Bioconda's channel setup expects,
# so dependencies that live only in conda-forge can be resolved.
mamba install -y -c conda-forge -c bioconda \
  htslib==1.13 bcftools==1.13 samtools==1.13 picard-slim==2.26.2 \
  bwa-mem2==2.2.1 bwa==0.7.17 gsutil==4.68

# Fetch the alignment-ready human reference FASTA and index:
gsutil -m cp "${ref_bucket}/${ref_fasta}"{,.fai} .

# Index the reference FASTA for use with various tools:
picard CreateSequenceDictionary -R "${ref_fasta}"
bwa-mem2 index "${ref_fasta}"
bwa index "${ref_fasta}"

# Fetch the dbSNP VCF, convert RefSeq Accession IDs to chromosome names, and keep only the CLNORIGIN info:
# --fail makes curl exit non-zero on an HTTP error instead of saving the error page.
curl -fLO "https://ftp.ncbi.nih.gov/snp/archive/b155/VCF/${dbsnp_raw}"
tabix -p vcf "${dbsnp_raw}"
# Columns 7 and 10 of the assembly report are written out as the old-name/new-name
# pairs given to --rename-chrs; rows containing the word "na" are dropped.
# The `tr` is defensive: NOTE(review) the report may use CRLF line endings -- confirm.
curl -fsSL "${assembly_report_url}" \
  | tr -d '\r' \
  | grep -v '^#' \
  | cut -f7,10 \
  | grep -wv na > "${acc_map}"
# "^INFO/CLNORIGIN" removes every INFO tag except CLNORIGIN (documented spelling).
bcftools annotate --threads "${threads}" --remove ^INFO/CLNORIGIN \
  --rename-chrs "${acc_map}" --output-type z --output "${dbsnp_all}" "${dbsnp_raw}"
tabix -p vcf "${dbsnp_all}"

# Generate a smaller dbSNP VCF listing only SNPs in autosomes, X, Y, and MT:
# Expands to chr1,chr2,...,chr22,chrX,chrY,chrM (trailing comma stripped below).
primary_contigs=$(printf 'chr%s,' {1..22} X Y M)
primary_contigs=${primary_contigs%,}
bcftools view --threads "${threads}" --types snps --regions "${primary_contigs}" \
  --output-type z --output-file "${dbsnp_snps}" "${dbsnp_all}"
tabix -p vcf "${dbsnp_snps}"

# Fetch the WES/WGS panel-of-normals generated using GATK on 1000genomes data:
gsutil -m cp "${gatk_bucket}/1000g_pon.hg38.vcf.gz"{,.tbi} .

# Fetch the WES/WGS gnomAD 2 VCF for use as a germline resource with MuTect2:
gsutil -m cp "${gatk_bucket}/af-only-gnomad.hg38.vcf.gz"{,.tbi} .
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment