Created
June 17, 2025 09:50
-
-
Save memoryfull/96d5110868e1602ef4c9bc41ea93dc91 to your computer and use it in GitHub Desktop.
This code seeks to replicate the following in-text statement from Syomin (2025): "For a validation, a list of law firms’ unique tax IDs obtained from the SME registry for the year 2023 is matched with the list of law firms’ tax IDs from the Russian Financial Statements Database (RFSD) [31] for the same year. The comparison shows that ~3,300 of the firms present in…
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(data.table)
library(arrow)

##################
# Load RFSD ----
# Open the RFSD dataset and build a scan restricted to the 2011-2023 rows
# and to the columns used below; only the result of that scan is converted
# to a data.table.
RFSD <- open_dataset("local/path/to/RFSD")
scan_builder <- RFSD$NewScan()
scan_builder$Filter(
  Expression$field_ref("year") >= 2011 &
    Expression$field_ref("year") <= 2023
)
scan_builder$Project(
  cols = c(
    "inn", "ogrn", "year", "okved_section", "okved", "eligible",
    "filed", "imputed", "outlier", "line_1150", "line_2110"
  )
)
scanner <- scan_builder$Finish()
rfsd_panel <- as.data.table(scanner$ToTable())
# Trigger garbage collection after the load
gc()

# Rename variables: line_1150 -> capital, line_2110 -> revenue.
# skip_absent = TRUE makes setnames() ignore either column if it is missing
# instead of raising an error.
setnames(
  rfsd_panel,
  c("line_1150", "line_2110"),
  c("capital", "revenue"),
  skip_absent = TRUE
)

# Only firms with non-missing INNs
rfsd_panel <- rfsd_panel[!is.na(inn)]

# Mark firms with industry code in legal services.
# law_firm is 1 when the first four characters of okved are "69.1",
# 0 when they are anything else, and NA when okved itself is NA.
rfsd_panel[, law_firm := as.numeric(substr(okved, 1, 4) == "69.1")]

# 2023 cross-section
rfsd_crossection <- rfsd_panel[year == 2023]
##################
# SME register ----
# Data from Syomin (2025), https://zenodo.org/records/14942591
# Read the parquet panel first, then convert it to a data.table.
syomin_panel <- read_parquet(
  "law-firms-names-materials/datasets/law-firms/panel.parquet"
)
syomin_panel <- as.data.table(syomin_panel)

# Restrict to 2023 to mirror the comparison described in the paper:
# "For a validation, a list of law firms’ unique tax IDs obtained
# from the SME registry for the year 2023 is matched with the list
# of law firms’ tax IDs from the Russian Financial Statements
# Database (RFSD) for the same year."
# NOTE(review): the meaning of kind == 1 is not visible in this script;
# confirm against the dataset's documentation.
syomin_crossection <- syomin_panel[year == 2023][kind == 1]
##################
# Perform the comparison ----

# Law firms missing in Syomin crossection but present in RFSD crossection:
# RFSD 2023 rows whose INN is not among the Syomin TINs, restricted to
# rows flagged as law firms.
syomin_missing <- rfsd_crossection[
  !(inn %in% syomin_crossection$tin)
][law_firm == 1]
nrow(syomin_missing) # 7283

# Law firms missing in RFSD cross section but present in Syomin data:
# Syomin 2023 rows whose TIN is not among the INNs of RFSD 2023 rows
# flagged as law firms. Selecting the two columns yields a new table, so
# the rename below does not touch syomin_crossection.
rfsd_missing <- syomin_crossection[
  !(tin %in% rfsd_crossection[law_firm == 1]$inn)
][, c("tin", "org_name")]
setnames(rfsd_missing, "tin", "inn")
nrow(rfsd_missing) # 1150

# Attach OKVED for the missing firms (left join: every row of rfsd_missing
# is kept; okved and law_firm are NA when the INN has no RFSD 2023 row)
rfsd_missing <- merge(
  rfsd_missing,
  rfsd_crossection[, c("inn", "okved", "law_firm")],
  by = "inn",
  all.x = TRUE,
  all.y = FALSE
)

# Most of the discrepancy is due to the wrong okved code: these firms are
# in the RFSD 2023 cross-section, but with a non-"69.1" industry code
nrow(rfsd_missing[law_firm == 0]) # 888

# Examine the genuinely missing firms: flagged as a law firm, or with no
# OKVED attached by the merge above
nrow(rfsd_missing[law_firm == 1 | is.na(okved)]) # 262

# Mark firms missing in RFSD cross-section but present in panel.
# rfsd_panel spans 2011-2023 and therefore contains the 2023 cross-section
# itself, so firms matched above with a non-law OKVED code are flagged too.
rfsd_missing[, is_in_panel := as.integer(inn %in% rfsd_panel$inn)]
nrow(rfsd_missing[is_in_panel == 1]) # 943

# Examples of missings in Syomin data that are present in RFSD
print(syomin_missing[, c("inn", "okved", "law_firm")], topn = 25)

# Examples of false missings in RFSD data that are present in Syomin data
# due to industry code mismatch
print(
  rfsd_missing[law_firm == 0, c("inn", "okved", "law_firm", "is_in_panel")],
  topn = 25
)

# Examples of true missings in RFSD data that are present in Syomin data
print(
  rfsd_missing[
    law_firm == 1 | is.na(okved),
    c("inn", "okved", "law_firm", "is_in_panel")
  ],
  topn = 25
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment