Last active
May 28, 2018 21:57
-
-
Save tgirke/e0b645fb4e0f0b590fed63925558da38 to your computer and use it in GitHub Desktop.
Map Pfam domain with HMMER3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
############################################## | |
## Map Pfam Domains to Proteins with HMMER3 ## | |
############################################## | |
## Author: Thomas Girke | |
## Date: May 11, 2018 | |
## Utility: mapping of Pfam domains to protein sequences. | |
## The module load and Pfam database paths given below are specific to the HPCC/biocluster system. | |
## For details consult the man page for hmmscan from the command-line with 'hmmscan -h' | |
library(systemPipeR) | |
moduleload("hmmer/3.1b2") | |
## Note: the file myseq.txt contains yours protein sequences and Pfam-A.hmm is the Pfam database available on HPCC cluster | |
system("hmmscan -E 0.1 --acc --domtblout output.pfam /srv/projects/db/pfam/2017-06-11-Pfam31.0/Pfam-A.hmm myseq.txt > /dev/null") | |
## Read Pfam mapping results into R | |
pfdf <- read.table("output.pfam", comment="#", header=FALSE) | |
## For fast import use fread from data.table package | |
library(data.table) | |
pfdf <- fread("grep -v '^#' output.pfam") # syntax to ignore lines starting with `#` | |
colnames(pfdf) <- c("target name", "accession1", "tlen", "query name", "accession2", "qlen", "E-value", "score1", "bias1", "No", "of", "c-Evalue", "i-Evalue", "score2", "bias2", "from1", "to`", "from2", "to2", "from3", "to3", "acc", "description of target") | |
## tibbles from dplyr are a more modern version of data.frames | |
library(dplyr, quietly=TRUE) | |
pfdf <- as_data_frame(pfdf) | |
pfdf |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment