ckandoth · September 24, 2021 23:53
diff --git a/prep_grch38_ref.txt b/prep_grch38_ref.txt
 # Prepare a conda environment with the tools we will need.
 # -y on both mamba calls skips the interactive confirmation so a pasted or
 # scripted run does not stall; && skips activation if the create step fails.
 mamba create -y -n ref && conda activate ref
 mamba install -y -c bioconda htslib==1.13 bcftools==1.13 samtools==1.13 picard-slim==2.26.2 bwa-mem2==2.2.1 bwa==0.7.17 gsutil==4.68

 # Download the alignment-ready human reference FASTA together with its .fai
 # index (both object paths are spelled out instead of using brace expansion):
 gsutil -m cp \
   gs://genomics-public-data/references/GRCh38_Verily/GRCh38_Verily_v1.genome.fa \
   gs://genomics-public-data/references/GRCh38_Verily/GRCh38_Verily_v1.genome.fa.fai \
   .

 # Index the reference FASTA for the tools used later: a Picard sequence
 # dictionary first, then one index per BWA flavour (bwa-mem2, then bwa).
 picard CreateSequenceDictionary -R GRCh38_Verily_v1.genome.fa
 for aligner in bwa-mem2 bwa; do
   "$aligner" index GRCh38_Verily_v1.genome.fa
 done

 # Fetch the dbSNP b155 VCF and index it. -f (--fail) makes curl exit non-zero
 # on an HTTP error instead of saving the server's error page under the .gz name.
 curl -fLO https://ftp.ncbi.nih.gov/snp/archive/b155/VCF/GCF_000001405.39.gz
 tabix -p vcf GCF_000001405.39.gz

 # Build the contig rename map from the assembly report: drop '#' header lines,
 # keep columns 7 and 10 (presumably the RefSeq accession and the chr-style name;
 # confirm against the report's column header), and skip rows containing "na".
 curl -fsSL https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_report.txt \
   | grep -v '^#' \
   | cut -f7,10 \
   | grep -wv na \
   > GCF_000001405.39.acc_ids.txt

 # Rename RefSeq accession IDs to chromosome names using that map and drop every
 # INFO tag except CLNORIGIN (the leading ^ inverts the removal list), then index:
 bcftools annotate --threads 8 --remove '^INFO/CLNORIGIN' --rename-chrs GCF_000001405.39.acc_ids.txt --output-type z --output dbsnp_b155_grch38_all.vcf.gz GCF_000001405.39.gz
 tabix -p vcf dbsnp_b155_grch38_all.vcf.gz

 # Derive a smaller dbSNP VCF that keeps only SNP records on the autosomes,
 # X, Y, and MT, then index it:
 bcftools view --threads 8 -v snps \
   -r chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chrX,chrY,chrM \
   -O z -o dbsnp_b155_grch38_snps.vcf.gz \
   dbsnp_b155_grch38_all.vcf.gz
 tabix -p vcf dbsnp_b155_grch38_snps.vcf.gz

 # Download the WES/WGS panel-of-normals VCF (generated using GATK on
 # 1000genomes data) together with its .tbi index:
 gsutil -m cp \
   gs://gatk-best-practices/somatic-hg38/1000g_pon.hg38.vcf.gz \
   gs://gatk-best-practices/somatic-hg38/1000g_pon.hg38.vcf.gz.tbi \
   .

 # Download the WES/WGS gnomAD 2 VCF and its .tbi index, for use as the
 # germline resource with MuTect2:
 gsutil -m cp \
   gs://gatk-best-practices/somatic-hg38/af-only-gnomad.hg38.vcf.gz \
   gs://gatk-best-practices/somatic-hg38/af-only-gnomad.hg38.vcf.gz.tbi \
   .
	# Prepare a conda environment with the tools we will need.
	# -y on both mamba calls skips the interactive confirmation so a pasted or
	# scripted run does not stall; && skips activation if the create step fails.
	mamba create -y -n ref && conda activate ref
	mamba install -y -c bioconda htslib==1.13 bcftools==1.13 samtools==1.13 picard-slim==2.26.2 bwa-mem2==2.2.1 bwa==0.7.17 gsutil==4.68

	# Download the alignment-ready human reference FASTA together with its .fai
	# index (both object paths are spelled out instead of using brace expansion):
	gsutil -m cp \
	  gs://genomics-public-data/references/GRCh38_Verily/GRCh38_Verily_v1.genome.fa \
	  gs://genomics-public-data/references/GRCh38_Verily/GRCh38_Verily_v1.genome.fa.fai \
	  .

	# Index the reference FASTA for the tools used later: a Picard sequence
	# dictionary first, then one index per BWA flavour (bwa-mem2, then bwa).
	picard CreateSequenceDictionary -R GRCh38_Verily_v1.genome.fa
	for aligner in bwa-mem2 bwa; do
	  "$aligner" index GRCh38_Verily_v1.genome.fa
	done

	# Fetch the dbSNP b155 VCF and index it. -f (--fail) makes curl exit non-zero
	# on an HTTP error instead of saving the server's error page under the .gz name.
	curl -fLO https://ftp.ncbi.nih.gov/snp/archive/b155/VCF/GCF_000001405.39.gz
	tabix -p vcf GCF_000001405.39.gz

	# Build the contig rename map from the assembly report: drop '#' header lines,
	# keep columns 7 and 10 (presumably the RefSeq accession and the chr-style name;
	# confirm against the report's column header), and skip rows containing "na".
	# The pipes must be real, unescaped '|' characters: a backslash-escaped pipe is
	# handed to curl as a literal argument and the filter chain never runs.
	curl -fsSL https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_report.txt \
	  | grep -v '^#' \
	  | cut -f7,10 \
	  | grep -wv na \
	  > GCF_000001405.39.acc_ids.txt

	# Rename RefSeq accession IDs to chromosome names using that map and drop every
	# INFO tag except CLNORIGIN (the leading ^ inverts the removal list), then index:
	bcftools annotate --threads 8 --remove '^INFO/CLNORIGIN' --rename-chrs GCF_000001405.39.acc_ids.txt --output-type z --output dbsnp_b155_grch38_all.vcf.gz GCF_000001405.39.gz
	tabix -p vcf dbsnp_b155_grch38_all.vcf.gz

	# Derive a smaller dbSNP VCF that keeps only SNP records on the autosomes,
	# X, Y, and MT, then index it:
	bcftools view --threads 8 -v snps \
	  -r chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chrX,chrY,chrM \
	  -O z -o dbsnp_b155_grch38_snps.vcf.gz \
	  dbsnp_b155_grch38_all.vcf.gz
	tabix -p vcf dbsnp_b155_grch38_snps.vcf.gz

	# Download the WES/WGS panel-of-normals VCF (generated using GATK on
	# 1000genomes data) together with its .tbi index:
	gsutil -m cp \
	  gs://gatk-best-practices/somatic-hg38/1000g_pon.hg38.vcf.gz \
	  gs://gatk-best-practices/somatic-hg38/1000g_pon.hg38.vcf.gz.tbi \
	  .

	# Download the WES/WGS gnomAD 2 VCF and its .tbi index, for use as the
	# germline resource with MuTect2:
	gsutil -m cp \
	  gs://gatk-best-practices/somatic-hg38/af-only-gnomad.hg38.vcf.gz \
	  gs://gatk-best-practices/somatic-hg38/af-only-gnomad.hg38.vcf.gz.tbi \
	  .