Last active
August 27, 2025 11:49
-
-
Save agricolamz/96a610fd3e029350b2f814c13f92823d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This code is licensed under the terms of the MIT license | |
# Author: George Moroz | |
# Date: 2025-08-14 | |
# Update: 2025-08-27 | |
# see the official docs: https://ruscorpora.github.io/public-api/
library(tidyverse) | |
# API token used for the "Authorization: Bearer ..." header of every request.
# It is read from the RUSCORPORA_TOKEN environment variable so the secret does
# not have to be stored in the script; when the variable is unset or empty the
# original placeholder is used (replace it with your token in that case).
my_token <- Sys.getenv("RUSCORPORA_TOKEN", unset = "")
if (!nzchar(my_token)) {
  my_token <- "put your token here"
}
# Lemma sent in the `lemma` field of the word-portrait request bodies.
lemma_for_search <- "накачать"
library(httr2) | |
# check authentication ----------------------------------------------------
# Send the bearer token to the auth-check endpoint. The pipeline is not
# assigned, so the response object is printed when run at top level.
request("https://ruscorpora.ru/api/v1/auth/check-authenticated/") |>
  req_headers(Authorization = str_glue("Bearer {my_token}")) |>
  req_perform()
# Output recorded by the author for a successful check:
# <httr2_response>
# GET https://ruscorpora.ru/api/v1/auth/check-authenticated/
# Status: 200 OK
# Content-Type: application/json
# Body: In memory (4 bytes)
# PORTRAIT_WORD_INFO ------------------------------------------------------
# Request the PORTRAIT_WORD_INFO part of the word portrait for
# `lemma_for_search` in the MAIN corpus and parse the JSON body into a list.
result <- request("https://ruscorpora.ru/api/v1/word-portrait/") |>
  req_headers(
    Authorization = str_glue("Bearer {my_token}"),
    accept = "application/json",
    "Content-Type" = "application/json"
  ) |>
  req_body_json(list(
    lemma = lemma_for_search,
    corpus = list(type = "MAIN"),
    resultType = list("PORTRAIT_WORD_INFO")
  )) |>
  req_perform() |>
  resp_body_json()

# Build one row per item. The parsing fields are taken by position and named
# text, lex, gr and (when a fourth field is present) sem; the values of each
# field are joined with "; ".
result$propsData$items |>
  map(\(item) {
    fields <- item$parsingFields
    n_fields <- length(fields)
    # Items with any other number of parsing fields yield NULL, which
    # list_rbind() leaves out of the result.
    if (!n_fields %in% c(3L, 4L)) {
      return(NULL)
    }
    fields |>
      map_chr(\(field) paste0(unlist(field$value), collapse = "; ")) |>
      set_names(c("text", "lex", "gr", "sem")[seq_len(n_fields)]) |>
      as_tibble_row()
  }) |>
  list_rbind()
# PORTRAIT_CONCORDANCE ----------------------------------------------------
# Request the PORTRAIT_CONCORDANCE part of the word portrait for
# `lemma_for_search` in the MAIN corpus and parse the JSON body into a list.
result <- request("https://ruscorpora.ru/api/v1/word-portrait/") |>
  req_headers(
    Authorization = str_glue("Bearer {my_token}"),
    accept = "application/json",
    "Content-Type" = "application/json"
  ) |>
  req_body_json(list(
    lemma = lemma_for_search,
    corpus = list(type = "MAIN"),
    resultType = list("PORTRAIT_CONCORDANCE")
  )) |>
  req_perform() |>
  resp_body_json()

# Build one row per concordance group: the document's parsing fields become
# columns, followed by the document title, the snippet language and the
# snippet text. The result is printed with lemma, text and title first.
result$concordanceData$groups |>
  map(\(group) {
    # Only the first document of a group, and the first snippet / first
    # sequence of that document, are read.
    doc <- group$docs[[1]]
    snippet <- doc$snippetGroups[[1]]$snippets[[1]]
    fields <- doc$info$docExplainInfo$items[[1]]$parsingFields

    doc_title <- doc$info$title
    snippet_lang <- snippet$langInfo$lang
    # Word texts are concatenated without a separator, then whitespace is
    # normalised.
    snippet_text <- snippet$sequences[[1]]$words |>
      map_chr("text") |>
      str_c(collapse = "") |>
      str_squish()

    tibble(
      field = map_chr(fields, "name"),
      # Join the values of each field into one string. Flattening all values
      # into a single vector breaks the name/value pairing as soon as a field
      # has zero or several values (size-mismatch error or silent recycling).
      value = map_chr(fields, \(f) paste0(unlist(f$value), collapse = "; "))
    ) |>
      pivot_wider(names_from = "field", values_from = "value") |>
      # `.env$` makes sure the locals are used even if a parsing field
      # produced a column with the same name.
      mutate(
        title = .env$doc_title,
        language = .env$snippet_lang,
        text = .env$snippet_text
      )
  }) |>
  list_rbind() |>
  mutate(lemma = lemma_for_search) |>
  relocate(lemma, text, title)
# PORTRAIT_STATS ----------------------------------------------------
# Request the PORTRAIT_STATS part of the word portrait for `lemma_for_search`
# in the MAIN corpus and parse the JSON body into a list.
result <- request("https://ruscorpora.ru/api/v1/word-portrait/") |>
  req_headers(
    Authorization = str_glue("Bearer {my_token}"),
    accept = "application/json",
    "Content-Type" = "application/json"
  ) |>
  req_body_json(list(
    lemma = lemma_for_search,
    corpus = list(type = "MAIN"),
    resultType = list("PORTRAIT_STATS")
  )) |>
  req_perform() |>
  resp_body_json()

# One row per value of every field statistic, with the statistic's `field`
# as the first column.
result$statsData$fieldStats |>
  map(\(field_stat) {
    field_stat$values |>
      map(\(entry) {
        tibble(
          value = entry$key$valString$v,
          count = entry$count,
          docCount = entry$docCount,
          totalCount = entry$totalCount,
          totalDocCount = entry$totalDocCount
        )
      }) |>
      list_rbind() |>
      mutate(field = field_stat$field) |>
      relocate(field)
  }) |>
  list_rbind()
# PORTRAIT_SKETCH ----------------------------------------------------
# Request the PORTRAIT_SKETCH part of the word portrait for `lemma_for_search`
# in the MAIN corpus and parse the JSON body into a list.
result <- request("https://ruscorpora.ru/api/v1/word-portrait/") |>
  req_headers(
    Authorization = str_glue("Bearer {my_token}"),
    accept = "application/json",
    "Content-Type" = "application/json"
  ) |>
  req_body_json(list(
    lemma = lemma_for_search,
    corpus = list(type = "MAIN"),
    resultType = list("PORTRAIT_SKETCH")
  )) |>
  req_perform() |>
  resp_body_json()

# One row per collocation, labelled with the syntactic relation of the
# collocate group it belongs to and with the searched lemma.
result$sketchData$collocates |>
  map(\(relation) {
    relation$collocations |>
      map(\(item) {
        tibble(
          collocate = item$collocate$valString$v,
          # The first metric of the collocation is stored as `dice`.
          # NOTE(review): presumably the Dice coefficient; confirm in the
          # API docs that it is always the first metric.
          dice = item$metrics[[1]]$value
        )
      }) |>
      list_rbind() |>
      mutate(syntactic_relation = relation[["sketchSynRelation"]])
  }) |>
  list_rbind() |>
  mutate(lemma = lemma_for_search)
# PORTRAIT_FREQUENCY ----------------------------------------------------
# Request the PORTRAIT_FREQUENCY part of the word portrait for
# `lemma_for_search` in the MAIN corpus and parse the JSON body into a list.
result <- request("https://ruscorpora.ru/api/v1/word-portrait/") |>
  req_headers(
    Authorization = str_glue("Bearer {my_token}"),
    accept = "application/json",
    "Content-Type" = "application/json"
  ) |>
  req_body_json(list(
    lemma = lemma_for_search,
    corpus = list(type = "MAIN"),
    resultType = list("PORTRAIT_FREQUENCY")
  )) |>
  req_perform() |>
  resp_body_json()

# Print the `ipm` element of the frequency data.
pluck(result, "frequencyData", "ipm")
# PORTRAIT_SIMILAR ----------------------------------------------------
# Request the PORTRAIT_SIMILAR part of the word portrait for
# `lemma_for_search` in the MAIN corpus and parse the JSON body into a list.
result <- request("https://ruscorpora.ru/api/v1/word-portrait/") |>
  req_headers(
    Authorization = str_glue("Bearer {my_token}"),
    accept = "application/json",
    "Content-Type" = "application/json"
  ) |>
  req_body_json(list(
    lemma = lemma_for_search,
    corpus = list(type = "MAIN"),
    resultType = list("PORTRAIT_SIMILAR")
  )) |>
  req_perform() |>
  resp_body_json()

# One row per entry of the first element of the similar-words data: the word
# and its weight.
# NOTE(review): the column name `metics` is kept as in the original script;
# it looks like a typo for `metrics`.
result$similarData[[1]]$values |>
  (\(entries) {
    tibble(
      word = map_chr(entries, "word"),
      metics = map_dbl(entries, "weight")
    )
  })()
# PORTRAIT_MORPHEME ----------------------------------------------------
# Request the PORTRAIT_MORPHEME part of the word portrait for
# `lemma_for_search` in the MAIN corpus and parse the JSON body into a list.
result <- request("https://ruscorpora.ru/api/v1/word-portrait/") |>
  req_headers(
    Authorization = str_glue("Bearer {my_token}"),
    accept = "application/json",
    "Content-Type" = "application/json"
  ) |>
  req_body_json(list(
    lemma = lemma_for_search,
    corpus = list(type = "MAIN"),
    resultType = list("PORTRAIT_MORPHEME")
  )) |>
  req_perform() |>
  resp_body_json()

# One row per morpheme: its `value` (as `glossed`) and its `type`
# (as `morph_type`).
result$morphemeData$morphemes |>
  (\(morphemes) {
    tibble(
      glossed = map_chr(morphemes, "value"),
      morph_type = map_chr(morphemes, "type")
    )
  })()
# PORTRAIT_WORDFORMS ----------------------------------------------------
# Request the PORTRAIT_WORDFORMS part of the word portrait for
# `lemma_for_search` in the MAIN corpus and parse the JSON body into a list.
result <- request("https://ruscorpora.ru/api/v1/word-portrait/") |>
  req_headers(
    Authorization = str_glue("Bearer {my_token}"),
    accept = "application/json",
    "Content-Type" = "application/json"
  ) |>
  req_body_json(list(
    lemma = lemma_for_search,
    corpus = list(type = "MAIN"),
    resultType = list("PORTRAIT_WORDFORMS")
  )) |>
  req_perform() |>
  resp_body_json()

# One row per cell of the word-form table: the row label (as `case`), the
# column label (as `number`), the word form and its frequency information.
result$wordformsData$values |>
  map(\(cell) {
    form_info <- cell$wfValue
    tibble(
      case = cell$rowLabel$v,
      number = cell$columnLabel$v,
      form = form_info$value,
      ipm = form_info$freq$ipm,
      # docs: category from 1 to 3, where 1 is the most frequent form and
      # 3 the least frequent.
      category = form_info$freq$category
    )
  }) |>
  list_rbind()
# PORTRAIT_COGNATES ----------------------------------------------------
# Request the PORTRAIT_COGNATES part of the word portrait for
# `lemma_for_search` in the MAIN corpus and parse the JSON body into a list.
# The response is only stored in `result`; no table is built from it.
result <- request("https://ruscorpora.ru/api/v1/word-portrait/") |>
  req_headers(
    Authorization = str_glue("Bearer {my_token}"),
    accept = "application/json",
    "Content-Type" = "application/json"
  ) |>
  req_body_json(list(
    lemma = lemma_for_search,
    corpus = list(type = "MAIN"),
    resultType = list("PORTRAIT_COGNATES")
  )) |>
  req_perform() |>
  resp_body_json()
# does not work yet
# PORTRAIT_FIRST_MENTION ----------------------------------------------------
# Request the PORTRAIT_FIRST_MENTION part of the word portrait for
# `lemma_for_search` in the MAIN corpus and parse the JSON body into a list.
result <- request("https://ruscorpora.ru/api/v1/word-portrait/") |>
  req_headers(
    Authorization = str_glue("Bearer {my_token}"),
    accept = "application/json",
    "Content-Type" = "application/json"
  ) |>
  req_body_json(list(
    lemma = lemma_for_search,
    corpus = list(type = "MAIN"),
    resultType = list("PORTRAIT_FIRST_MENTION")
  )) |>
  req_perform() |>
  resp_body_json()

# Turn the parsing fields of the first info item into a wide table (one
# column per field name) and append the redirect lemma and redirect corpus
# type reported in the response.
result$firstMentionData$info$items[[1]]$parsingFields |>
  map(\(parsing_field) {
    tibble(
      field = parsing_field$name,
      value = unlist(parsing_field$value)
    )
  }) |>
  list_rbind() |>
  pivot_wider(names_from = field, values_from = value) |>
  mutate(
    redirect_lemma = result$firstMentionData$redirectLemma,
    redirect_corpus = result$firstMentionData$redirectCorpus$type
  )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment