George Moroz agricolamz

I teach linguistics and work on phonetics, linguistic mapping, statistics, and the indigenous languages of the Caucasus

174 followers · 79 following

Linguistic Convergence Laboratory, NRU HSE (https://ilcl.hse.ru/en/)
Russia, Moscow
https://www.hse.ru/en/org/persons/103489498
https://orcid.org/0000-0003-1990-6083
@aGricolaMZ

View GitHub Profile

Recently created

Least recently created

Recently updated

Least recently updated

agricolamz / ruscorpora.ru_API.R

Last active August 27, 2025 11:49

	# This code is licensed under the terms of the MIT license
	# Author: George Moroz
	# Date: 2025-08-14
	# Update: 2025-08-27
	# see the official docs: https://ruscorpora.github.io/public-api/

	library(tidyverse)
	my_token <- "put your token here"
	lemma_for_search <- "накачать"

agricolamz / df_to_xml.R

Last active March 25, 2025 16:44

	library(tidyverse)

	# Read data.xlsx and keep only the rows of the "Dagestan.xml" corpus that
	# have a non-missing string_id. The result is stored in `df`.
	# (Uses the native pipe `|>` and left assignment; the backslash-escaped
	# pipe does not parse in R.)
	df <- readxl::read_xlsx("data.xlsx") |>
	  filter(
	    corpus == "Dagestan.xml",
	    !is.na(string_id)
	  )

	df \|>
	select(where(function(x) sum(is.na(x)) != nrow(df))) \|>
	select(-person_id, -corpus) \|>

agricolamz / ngrams_sample.csv

Created March 12, 2025 11:37

	year	meaning_ru	frequency	corpus
	1800	вафля	0	ru-2019
	1801	вафля	0	ru-2019
	1802	вафля	0	ru-2019
	1803	вафля	0	ru-2019
	1804	вафля	0	ru-2019
	1805	вафля	0	ru-2019
	1806	вафля	0	ru-2019
	1807	вафля	0	ru-2019
	1808	вафля	0	ru-2019

agricolamz / code_for_6_task.R

Last active February 18, 2025 09:23

	library(tidyverse)
	# Download Coretta_2017_icelandic.csv and keep only the rows of speaker
	# "tt01". (Native pipe `|>` and left assignment; the backslash-escaped
	# pipe does not parse in R.)
	vowels <- read_csv("https://raw.githubusercontent.com/agricolamz/2024_HSE_b_da4l/master/data/Coretta_2017_icelandic.csv") |>
	  filter(speaker == "tt01")

	# Sample mean and standard deviation of the vowel.dur column for that speaker.
	mean_data <- mean(vowels$vowel.dur)
	sd_data <- sd(vowels$vowel.dur)

	# Product of two normal densities evaluated at x: one parameterised by the
	# global mean_data / sd_data, the other with mean 87 and sd 25.
	# Vectorised over x; mean_data and sd_data are looked up at call time.
	m1 <- function(x) {
	  density_from_data <- dnorm(x, mean = mean_data, sd = sd_data)
	  density_fixed <- dnorm(x, mean = 87, sd = 25)
	  density_from_data * density_fixed
	}

agricolamz / da4l_class_credible_intervals.R

Created February 11, 2025 09:14

	library(tidyverse)
	# Download Coretta_2017_icelandic.csv and keep only the rows of speaker
	# "tt01". (Native pipe `|>` and left assignment; the backslash-escaped
	# pipe does not parse in R.)
	vowels <- read_csv("https://raw.githubusercontent.com/agricolamz/2025_HSE_b_da4l/refs/heads/main/data/Coretta_2017_icelandic.csv") |>
	  filter(speaker == "tt01")

	# Standard deviations: a fixed prior value, the sample value from vowel.dur,
	# and a posterior value obtained by summing the two precisions (1 / sd^2).
	sd_prior <- 25
	sd_data <- sd(vowels$vowel.dur)
	sd_post <- 1/sqrt(1/sd_prior^2 + 1/sd_data^2)

	# Means: a fixed prior value and the sample mean of vowel.dur.
	mean_prior <- 87
	mean_data <- mean(vowels$vowel.dur)

agricolamz / msa_bivaltyp.R

Created February 5, 2025 13:24

	library(tidyverse)
	library(lingtypology)
	# Load the table returned by lingtypology's bivaltyp.feature().
	df <- bivaltyp.feature()

	# Export the rows whose family_WALS is "Nakh-Daghestanian" to an Excel file
	# on the Desktop. (Native pipe `|>`; the backslash-escaped pipe does not
	# parse in R.)
	df |>
	  filter(family_WALS == "Nakh-Daghestanian") |>
	  writexl::write_xlsx("~/Desktop/daghestan_bivaltyp.xlsx")

	df \|>

agricolamz / calculate_duration_textgrids.R

Created January 29, 2025 11:11

	library(tidyverse)
	# Total duration of the non-empty annotations on tier 2 across all files
	# read from a folder ("..." is a placeholder for the folder path).
	# NOTE(review): dividing by 60 assumes time_start / time_end are in
	# seconds; confirm against phonfieldwork's output.
	# (Native pipe `|>`; the backslash-escaped pipe does not parse in R.)
	phonfieldwork::read_from_folder("...") |>
	  filter(
	    tier == 2,
	    content != ""
	  ) |>
	  mutate(dur = time_end - time_start) |>
	  summarise(duration_minutes = sum(dur) / 60)

agricolamz / 4Sveta_N_mn_nompl.R

Created January 17, 2025 16:59

	library(tidyverse)
	# Read the two mystem-annotated dialect corpus tables.
	khi <- read_csv("russian_spoken_corpora_analysis/dialect_khislavichi_udpiped_mystemed.csv")
	spi <- read_csv("russian_spoken_corpora_analysis/dialect_spiridonova_buda_udpiped_mystemed.csv")

	# Stack both tables, keep rows with mystem_pos "S" whose mystem_feats match
	# "nom.pl" and start with "m," or "n,", and export them to an Excel file.
	# NOTE(review): presumably this selects masculine/neuter nouns in the
	# nominative plural; confirm against the mystem tag set.
	# (Native pipe `|>`; the backslash-escaped pipe does not parse in R.)
	khi |>
	  bind_rows(spi) |>
	  filter(
	    mystem_pos == "S",
	    str_detect(mystem_feats, "nom.pl"),
	    str_detect(mystem_feats, "^[mn],")
	  ) |>
	  writexl::write_xlsx("~/Desktop/4Sveta_N_mn_nompl.xlsx")

agricolamz / russian_borrowings_in_andic.R

Created December 18, 2024 10:53

	library(tidyverse)

	# Read the borrowings table and count the "-" characters in the two
	# comparison columns. Rows where the counts differ are labelled "problem"
	# and dropped; the remaining rows are stored in `df`.
	# Rows where either count is NA get an NA label and are dropped by filter()
	# as well.
	# (Native pipe `|>` and left assignment; the backslash-escaped pipe does
	# not parse in R.)
	df <- read_csv("andic_russian_bor2.csv") |>
	  mutate(
	    language_material = str_count(ipa_comparison, "-"),
	    russian_material = str_count(russian_ipa_comparison, "-"),
	    diff = if_else(russian_material == language_material, "", "problem")
	  ) |>
	  filter(diff != "problem")

	df \|>

agricolamz / mystem.R

Last active December 13, 2024 02:44

	library(tidyverse)

	# Text to be analysed.
	input <- "Мой друг, которого я давно не видел, крепко спит."

	# Write the text to a temporary file and reserve a second temporary path
	# that is passed to mystem as its output file.
	tmp_input <- tempfile(fileext = ".txt")
	tmp_output <- tempfile(fileext = ".txt")
	write_lines(input, tmp_input)

	# Build the command line and run it through the shell; requires a `mystem`
	# executable on the PATH.
	# NOTE(review): the paths are not shell-quoted, so a temp directory
	# containing spaces would break the command.
	# (Native pipe `|>`; the backslash-escaped pipe does not parse in R.)
	str_c("mystem -cdni --eng-gr ", tmp_input, " ", tmp_output) |>
	  system()

NewerOlder