Skip to content

Instantly share code, notes, and snippets.

View agricolamz's full-sized avatar

George Moroz agricolamz

View GitHub Profile
library(tidyverse)
# Load data.xlsx and keep only the rows of the "Dagestan.xml" corpus that
# have a non-missing string_id.
df <- readxl::read_xlsx("data.xlsx") |>
  filter(
    corpus == "Dagestan.xml",
    !is.na(string_id)
  )
df |>
select(where(function(x) sum(is.na(x)) != nrow(df))) |>
select(-person_id, -corpus) |>
year meaning_ru frequency corpus
1800 вафля 0 ru-2019
1801 вафля 0 ru-2019
1802 вафля 0 ru-2019
1803 вафля 0 ru-2019
1804 вафля 0 ru-2019
1805 вафля 0 ru-2019
1806 вафля 0 ru-2019
1807 вафля 0 ru-2019
1808 вафля 0 ru-2019
library(tidyverse)
# Read Coretta_2017_icelandic.csv and keep only the rows of speaker "tt01".
vowels <- read_csv(
  "https://raw.githubusercontent.com/agricolamz/2024_HSE_b_da4l/master/data/Coretta_2017_icelandic.csv"
) |>
  filter(speaker == "tt01")

# Sample mean and standard deviation of the vowel.dur column for that speaker.
mean_data <- mean(vowels$vowel.dur)
sd_data <- sd(vowels$vowel.dur)
# Product of two normal densities evaluated at x: one parameterised by the
# globals mean_data / sd_data, the other fixed at mean 87 and sd 25.
m1 <- function(x) {
  data_density <- dnorm(x, mean = mean_data, sd = sd_data)
  fixed_density <- dnorm(x, mean = 87, sd = 25)
  data_density * fixed_density
}
library(tidyverse)
# Read Coretta_2017_icelandic.csv and keep only the rows of speaker "tt01".
vowels <- read_csv(
  "https://raw.githubusercontent.com/agricolamz/2025_HSE_b_da4l/refs/heads/main/data/Coretta_2017_icelandic.csv"
) |>
  filter(speaker == "tt01")

# Standard deviations: a fixed prior value, the sample value of vowel.dur,
# and their combination (inverse square root of the summed precisions).
sd_prior <- 25
sd_data <- sd(vowels$vowel.dur)
sd_post <- 1 / sqrt(1 / sd_prior^2 + 1 / sd_data^2)

# Means: a fixed prior value and the sample mean of vowel.dur.
mean_prior <- 87
mean_data <- mean(vowels$vowel.dur)
library(tidyverse)
library(lingtypology)
# Table returned by bivaltyp.feature().
df <- bivaltyp.feature()

# Export the rows whose family_WALS is "Nakh-Daghestanian" to an Excel file
# on the desktop.
writexl::write_xlsx(
  filter(df, family_WALS == "Nakh-Daghestanian"),
  "~/Desktop/daghestan_bivaltyp.xlsx"
)
df |>
library(tidyverse)
# Total duration, in minutes, of the non-empty annotations on tier 2:
# sum (time_end - time_start) over the kept rows and divide by 60.
phonfieldwork::read_from_folder("...") |>
  filter(
    tier == 2,
    content != ""
  ) |>
  summarise(duration_minutes = sum(time_end - time_start) / 60)
library(tidyverse)
# Read the two dialect corpus tables (Khislavichi and Spiridonova Buda).
khi <- read_csv(
  "russian_spoken_corpora_analysis/dialect_khislavichi_udpiped_mystemed.csv"
)
spi <- read_csv(
  "russian_spoken_corpora_analysis/dialect_spiridonova_buda_udpiped_mystemed.csv"
)

# Stack both tables, keep rows with mystem_pos "S" whose mystem_feats matches
# "nom.pl" and starts with "m," or "n,", then write them to an Excel file.
# NOTE(review): the "." in "nom.pl" is a regex wildcard (any one character);
# confirm that is intended rather than a literal separator.
bind_rows(khi, spi) |>
  filter(
    mystem_pos == "S",
    str_detect(mystem_feats, "nom.pl"),
    str_detect(mystem_feats, "^[mn],")
  ) |>
  writexl::write_xlsx("~/Desktop/4Sveta_N_mn_nompl.xlsx")
library(tidyverse)
# Read andic_russian_bor2.csv, count the "-" characters in both IPA comparison
# columns, flag rows where the two counts differ as "problem", and keep only
# the unflagged rows.
df <- read_csv("andic_russian_bor2.csv") |>
  mutate(
    language_material = str_count(ipa_comparison, "-"),
    russian_material = str_count(russian_ipa_comparison, "-"),
    diff = if_else(russian_material == language_material, "", "problem")
  ) |>
  filter(diff != "problem")
df |>
library(tidyverse)
# Run the external `mystem` program on one sentence: write the sentence to a
# temporary file and pass that file plus a second temporary file path to
# `mystem` on the command line.
input <- "Мой друг, которого я давно не видел, крепко спит."
tmp_input <- tempfile(fileext = ".txt")
tmp_output <- tempfile(fileext = ".txt")
write_lines(input, tmp_input)
# Quote both paths so the command line stays intact when the temporary
# directory contains spaces or other shell metacharacters.
str_c(
  "mystem -cdni --eng-gr ",
  shQuote(tmp_input), " ", shQuote(tmp_output)
) |>
  system()
library(tidyverse)
# Load database.csv from the LingConLab/rutul_dialectology GitHub repository.
df <- read_csv("https://raw.githubusercontent.com/LingConLab/rutul_dialectology/master/data/database.csv")
df |>
select(feature_title, feature_lexeme, value, settlement, value) |>
filter(!is.na(value),
value != "NO DATA",
value != "OTHER",
value != "\\?",
!(settlement %in% c("Tsudik", "Borch"))) |>
mutate(value = str_split(value, ";")) |>