Last active
April 12, 2020 16:20
-
-
Save mtmorgan/1900a67f4a17f075fa97df00a46e7991 to your computer and use it in GitHub Desktop.
construct distance between GO nodes from OBO file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
obo = readLines("go.obo") | |
## | |
## Clean data | |
## | |
## blank lines separate different 'groups' in the obo | |
group <- cumsum(!nzchar(obo)) | |
## keep groups that have the [Term] label | |
is_term = group[grepl("^\\[Term\\]$", obo)] | |
obo = obo[group %in% is_term] | |
group = group[group %in% is_term] | |
## what have we got? | |
length(unique(group)) # 47426 GO terms | |
unique(sub(":.*", "", obo)) # what keys are there? | |
## only keep 'id' and 'is_a' lines | |
id = obo[startsWith(obo, "id:")] | |
isa = obo[startsWith(obo, "is_a:")] | |
isa_group = group[startsWith(obo, "is_a:")] | |
## keep only GO ids | |
id <- sub(".*(GO:[[:digit:]]+).*", "\\1", id) | |
isa <- sub(".*(GO:[[:digit:]]+).*", "\\1", isa) | |
## create the map... | |
map0 = split(isa, factor(id[isa_group], levels = id)) | |
length(map0) # 47426, same as number of GO terms | |
table(lengths(map0)) # id's have between 0 and 10 'is_a' relations | |
## ...and a data.frame representing an 'unlisted' version of the map | |
map = data.frame( | |
id = rep(names(map0), lengths(map0)), | |
is_a = unlist(map0, use.names = FALSE) | |
) | |
## | |
## Calculate the distance between each id and any other id reachable | |
## via 'is_a' relations | |
## | |
## distance from each term to itself... | |
step = 0L | |
distance = data.frame(term1 = id, term2 = id, step) | |
repeat { | |
## update | |
step = step + 1L | |
## each 'is_a' id has a number 'n' of is_a relations... | |
n = lengths(map0)[map$is_a] | |
## create a new data.frame representing the relationship between | |
## the original id and next level of is_a relationship | |
map = data.frame( | |
id = rep(map$id, n), | |
is_a = unlist(map0[map$is_a], use.names = FALSE) | |
) | |
## stop when there are no more is_a relationships to pursue | |
if (nrow(map) == 0L) break | |
message(nrow(map)) | |
## update the table of distances | |
term1 = map$id | |
term2 = map$is_a | |
distance = rbind(distance, data.frame(term1, term2, step)) | |
} | |
nrow(distance) | |
## keep the first (shortest) distance? | |
dup <- duplicated(distance[,1:2]) | |
shortest <- distance[!dup,] | |
nrow(shortest) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment