Skip to content

Instantly share code, notes, and snippets.

@memoryfull
Created June 17, 2025 09:50
Show Gist options
  • Save memoryfull/96d5110868e1602ef4c9bc41ea93dc91 to your computer and use it in GitHub Desktop.
Save memoryfull/96d5110868e1602ef4c9bc41ea93dc91 to your computer and use it in GitHub Desktop.
This code seeks to replicate the following in-text statement from Syomin (2025): "For a validation, a list of law firms’ unique tax IDs obtained from the SME registry for the year 2023 is matched with the list of law firms’ tax IDs from the Russian Financial Statements Database (RFSD) [31] for the same year. The comparison shows that ~3,300 of the firms present in…
library(data.table)
library(arrow)
##################
# Load RFSD
# Build an Arrow scanner over the RFSD dataset that keeps only the rows and
# columns needed below, then materialise the result as a data.table.
RFSD <- open_dataset("local/path/to/RFSD")
scan_builder <- RFSD$NewScan()
# Row filter: statement years 2011 through 2023 (inclusive)
scan_builder$Filter(Expression$field_ref("year") >= 2011 & Expression$field_ref("year") <= 2023)
# Column projection: identifiers, year, industry codes, status flags and the
# two statement lines that are renamed below
scan_builder$Project(cols = c("inn", "ogrn", "year", "okved_section", "okved", "eligible", "filed", "imputed", "outlier", "line_1150", "line_2110"))
scanner <- scan_builder$Finish()
rfsd_panel <- as.data.table(scanner$ToTable())
# Trigger garbage collection after materialising the table
gc()
# Rename variables: line_1150 -> capital, line_2110 -> revenue.
# skip_absent = TRUE makes setnames() ignore a column that is not present.
setnames(rfsd_panel, c("line_1150", "line_2110"),
         c("capital", "revenue"),
         skip_absent = TRUE)
# Only firms with non-missing INNs
rfsd_panel <- rfsd_panel[!is.na(inn)]
# Mark firms with industry code in legal services:
# law_firm is 1 when the first four characters of okved are "69.1",
# 0 otherwise, and NA when okved itself is NA.
rfsd_panel[, law_firm := as.numeric(substr(okved, 1, 4) == "69.1")]
# 2023 cross-section
rfsd_crossection <- rfsd_panel[year == 2023]
##################
# Load SME register from Syomin (2025), https://zenodo.org/records/14942591
# Read the parquet panel first, then convert it to a data.table.
syomin_panel <- read_parquet("law-firms-names-materials/datasets/law-firms/panel.parquet")
syomin_panel <- as.data.table(syomin_panel)
# Keep only the 2023 rows with kind == 1, mirroring the comparison
# described in the paper:
# "For a validation, a list of law firms’ unique tax IDs obtained
# from the SME registry for the year 2023 is matched with the list
# of law firms’ tax IDs from the Russian Financial Statements
# Database (RFSD) for the same year."
# NOTE(review): the meaning of kind == 1 is not visible here — confirm
# against the dataset documentation.
syomin_crossection <- syomin_panel[kind == 1 & year == 2023]
##################
# Perform the comparison
# RFSD 2023 law firms (law_firm == 1) whose INN does not occur among the
# Syomin 2023 tax IDs
syomin_missing <- rfsd_crossection[law_firm == 1 & !(inn %in% syomin_crossection$tin)]
nrow(syomin_missing) # 7283
# Syomin 2023 firms whose tax ID does not occur among the RFSD 2023 law firms;
# keep the tax ID (renamed to inn for the later merge) and the firm name
rfsd_missing <- syomin_crossection[
  !(tin %in% rfsd_crossection[law_firm == 1, inn]),
  .(inn = tin, org_name)
]
nrow(rfsd_missing) # 1150
# Attach OKVED for the missing firms.
# Left join: every row of rfsd_missing is kept; firms that are absent from
# the RFSD 2023 cross-section get NA in okved and law_firm.
rfsd_missing <- merge(rfsd_missing, rfsd_crossection[, c("inn", "okved", "law_firm")], by = "inn", all.x = TRUE, all.y = FALSE)
# Most of the discrepancy is due to the wrong okved code:
# these firms are in the RFSD 2023 cross-section, but under a non-legal OKVED
nrow(rfsd_missing[law_firm == 0 ]) # 888
# Examine the genuine discrepancy: firms with no OKVED attached by the merge
# (law_firm == 1 is retained from the original condition)
nrow(rfsd_missing[law_firm == 1 | is.na(okved)]) # 262
# Mark firms whose INN appears anywhere in the RFSD panel (any year kept in
# rfsd_panel). The panel contains the 2023 cross-section, so firms matched
# above under a different OKVED are flagged too, not only firms that are
# absent from the cross-section.
rfsd_missing[, is_in_panel := as.integer(inn %in% rfsd_panel$inn)]
nrow(rfsd_missing[is_in_panel == 1]) # 943
# Sample of RFSD law firms that are absent from the Syomin data
print(syomin_missing[, .(inn, okved, law_firm)], topn = 25)
# Sample of Syomin firms that only look missing from RFSD: they are in the
# RFSD cross-section, but with a non-legal industry code
print(rfsd_missing[law_firm == 0, .(inn, okved, law_firm, is_in_panel)], topn = 25)
# Sample of Syomin firms that are truly missing from the RFSD cross-section
# (no OKVED was attached by the merge)
print(rfsd_missing[is.na(okved) | law_firm == 1, .(inn, okved, law_firm, is_in_panel)], topn = 25)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment