Skip to content

Instantly share code, notes, and snippets.

@agricolamz
Last active December 13, 2024 02:44
Show Gist options
  • Save agricolamz/f40e2c809991351ed975cd955a8ac223 to your computer and use it in GitHub Desktop.
Save agricolamz/f40e2c809991351ed975cd955a8ac223 to your computer and use it in GitHub Desktop.
library(tidyverse)
# Tag a Russian sentence with the external `mystem` program and reshape its
# raw output into a tibble with one row per token:
#   mystem_id, mystem_sentence, mystem_token, mystem_lemma, mystem_pos,
#   mystem_feats
# Requires the `mystem` executable to be reachable through the PATH.
input <- "Мой друг, которого я давно не видел, крепко спит."

# mystem reads from and writes to files, so the text goes through a pair of
# temporary files.
tmp_input <- tempfile(fileext = ".txt")
tmp_output <- tempfile(fileext = ".txt")

# Run the tagger and slurp its output. The `finally` clause guarantees that
# both temporary files are removed even when mystem is missing or fails.
mystem_lines <- tryCatch(
  {
    write_lines(input, tmp_input)
    # Flags: -c copy the whole input (keeps punctuation), -d contextual
    # disambiguation, -n one token per line, -i grammatical information,
    # --eng-gr Latin grammeme labels.
    # Paths are quoted so that temp directories containing spaces work.
    mystem_status <- system2(
      "mystem",
      args = c("-cdni", "--eng-gr", shQuote(tmp_input), shQuote(tmp_output))
    )
    if (mystem_status != 0L) {
      stop("mystem exited with status ", mystem_status, call. = FALSE)
    }
    read_lines(tmp_output)
  },
  finally = unlink(c(tmp_input, tmp_output))
)

# Expected shape of an analysed line: token{lemma=POS,feats=feats}.
# Lines without a {...} part (punctuation) get the POS label "PUNCT".
mystem_result <- mystem_lines |>
  enframe() |>
  # Drop the first "_" and the literal two-character sequence "\n" from each
  # output line, then discard lines left empty.
  mutate(
    value = str_remove(value, "_"),
    value = str_remove(value, "\\\\n")
  ) |>
  filter(value != "") |>
  mutate(
    # Split the line into the surface token and the analysis inside {...}.
    mystem_feats = str_extract(value, "\\{.*\\}"),
    value = str_remove(value, "\\{.*\\}"),
    mystem_feats = str_remove_all(mystem_feats, "[\\{\\}]"),
    # Everything up to the first "=" is the lemma.
    mystem_lemma = str_extract(mystem_feats, ".*?="),
    mystem_lemma = str_remove(mystem_lemma, "="),
    mystem_feats = str_remove(mystem_feats, ".*?="),
    # The next "=" separates two groups of features; mark it with "|".
    mystem_feats = str_replace(mystem_feats, "=", "|"),
    # The first word of what remains is the part of speech.
    mystem_pos = str_extract(mystem_feats, "\\w{1,}"),
    mystem_pos = if_else(is.na(mystem_pos), "PUNCT", mystem_pos),
    # Remove the POS and the separator that followed it from the features.
    mystem_feats = str_remove(mystem_feats, "\\w{1,}"),
    mystem_feats = str_remove(mystem_feats, "^\\|"),
    mystem_feats = str_remove(mystem_feats, "^\\,"),
    mystem_feats = if_else(mystem_feats == "", NA_character_, mystem_feats),
    # Use the in-memory text instead of re-reading the temp file; collapsing
    # keeps this a length-one value even when `input` has several lines.
    mystem_sentence = str_c(input, collapse = "\n"),
    # seq_len() stays valid when no tokens survived the filter (1:0 is not).
    mystem_id = as.character(seq_len(n()))
  ) |>
  rename(mystem_token = value) |>
  select(
    mystem_id, mystem_sentence, mystem_token,
    mystem_lemma, mystem_pos, mystem_feats
  )

mystem_result
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment