Skip to content

Instantly share code, notes, and snippets.

@mrchypark
Last active May 31, 2019 08:02
Show Gist options
  • Save mrchypark/929a70b71c1600ed61e57860930cde06 to your computer and use it in GitHub Desktop.
Save mrchypark/929a70b71c1600ed61e57860930cde06 to your computer and use it in GitHub Desktop.
library(rvest)
library(writexl)
library(dplyr)
library(stringr)
# Scrape case numbers and the first body section of each case listed on the
# casenote.kr search results, then save them to "case.xlsx".
#
# Output: an Excel file with one row per scraped case and two columns,
#   case_no - text of the second <div> inside div#header on the case page
#   body    - text of every <p> between the first and second "title"
#             paragraphs inside div#mainbar, joined with newlines
#             (NA when the page has fewer than two "title" paragraphs)

# Search URL; the query string is the percent-encoded search term.
search_url <- "https://casenote.kr/search/?q=%EC%82%BC%EC%84%B1"
site_root <- "https://casenote.kr"
# Result pages to walk through (the `p` query parameter).
pages <- 1:3

# One slot per result page; flattened after the loop so the number of hits
# on a page does not have to be exactly 10.
case_no_by_page <- vector("list", length(pages))
body_by_page <- vector("list", length(pages))

for (k in seq_along(pages)) {
  j <- pages[k]
  print(j)

  # Relative links to the individual case pages listed on this result page.
  hrefs <- read_html(paste0(search_url, "&p=", j)) %>%
    html_nodes('a[class="casename"]') %>%
    html_attr("href")
  hrefs <- hrefs[!is.na(hrefs)]

  # paste0() recycles a zero-length vector to "", which would turn an empty
  # result page into a request for the bare site root, so skip it instead.
  if (length(hrefs) == 0) {
    next
  }
  case_urls <- paste0(site_root, hrefs)

  page_case_no <- rep(NA_character_, length(case_urls))
  page_body <- rep(NA_character_, length(case_urls))

  for (i in seq_along(case_urls)) {
    case_page <- read_html(case_urls[i])

    # Indexing past the end of a character vector yields NA, so a page with
    # fewer than two header <div>s simply records NA here.
    header_text <- case_page %>%
      html_nodes('div[id="header"] div') %>%
      html_text()
    page_case_no[i] <- header_text[2]

    paragraphs <- case_page %>%
      html_nodes('div[id="mainbar"] p')
    paragraph_class <- html_attr(paragraphs, "class")
    paragraph_text <- html_text(paragraphs)

    # Positions of paragraphs whose class contains "title"; which() drops
    # the NA produced for paragraphs that have no class attribute.
    title_idx <- which(str_detect(paragraph_class, "title"))

    if (length(title_idx) >= 2) {
      # seq_len() gives an empty range when the two titles are adjacent,
      # whereas seq(a + 1, b - 1) would count backwards.
      n_between <- title_idx[2] - title_idx[1] - 1
      section <- paragraph_text[title_idx[1] + seq_len(n_between)]
      # Collapse to ONE string: assigning several paragraphs into a single
      # vector slot keeps only the first one (with a warning).
      # NOTE(review): Excel caps a cell at 32,767 characters; confirm how
      # write_xlsx treats longer strings if sections can get that large.
      page_body[i] <- paste(section, collapse = "\n")
    }
  }

  case_no_by_page[[k]] <- page_case_no
  body_by_page[[k]] <- page_body
}

# as.character() keeps the columns typed even if nothing was scraped
# (unlist() of an all-NULL list is NULL).
case_no <- as.character(unlist(case_no_by_page))
body <- as.character(unlist(body_by_page))

write_xlsx(data.frame(case_no, body), "case.xlsx")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment