Skip to content

Instantly share code, notes, and snippets.

@mrchypark
Last active June 7, 2019 16:01
Show Gist options
  • Save mrchypark/651b3a7cabbd12bbe4b851b57b23a863 to your computer and use it in GitHub Desktop.
Save mrchypark/651b3a7cabbd12bbe4b851b57b23a863 to your computer and use it in GitHub Desktop.
library(httr)
library(rvest)
library(dplyr)
library(magrittr)
library(stringr)
library(jsonlite)
library(purrr)
library(tibble)
library(openssl)
library(crul)
#' Fetch one page of the post-title listing and parse it as JSON.
#'
#' The response body is cut at the first `,"pagingHtml` marker and a closing
#' brace is appended, so only the part of the object that precedes that field
#' is handed to `fromJSON()`.
#' NOTE(review): presumably the pagingHtml field is not valid JSON as served;
#' confirm against a live response.
#'
#' @param tar URL to request with `GET()`.
#' @return The value `fromJSON()` produces for the truncated body.
get_links <- function(tar) {
  body <- content(GET(tar))

  # Keep only the text in front of the first `,"pagingHtml` occurrence.
  pieces <- str_split(body, ',"pagingHtml')
  json_head <- pieces[[1]][1]

  # Re-close the object that the cut left open, then parse it.
  fromJSON(str_c(json_head, "}"))
}
# ---- Collect the logNo of every post on the blog ----

# Number of posts requested per listing page.
size <- 30
# A listing URL is assembled as: root + page number + seq2.
root <- "https://blog.naver.com/PostTitleListAsync.nhn?blogId=kdh4548&viewdate=&currentPage="
seq2 <- str_c("&categoryNo=0&parentCategoryNo=&countPerPage=", size)

# The first page is fetched only to learn the total number of posts.
dat <- get_links(str_c(root, 1, seq2))
tcnt <- as.integer(dat$totalCount)

# ceiling() is the exact page count. The previous
# `as.integer(tcnt / size) + 1` requested one page too many whenever tcnt
# was an exact multiple of size.
n_pages <- ceiling(tcnt / size)

# Walk every listing page and gather the logNo values into one flat vector.
# The guard avoids building a page-less URL when there are no posts at all
# (older stringr versions drop zero-length arguments in str_c()).
links <- if (n_pages > 0) {
  str_c(root, seq_len(n_pages), seq2) %>%
    map(~ get_links(.x)$postList$logNo) %>%
    unlist()
} else {
  NULL
}
# ---- Resolve the inner frame of every post page ----

root <- "https://blog.naver.com/kdh4548/"

# Build the rate-limited reader ONCE. slowly() keeps its call counter inside
# the wrapper it returns and never sleeps on a wrapper's first call, so
# creating a new wrapper for every URL (as `slowly(read_html)(.x)` did)
# disabled the throttling entirely.
slow_read_html <- slowly(read_html)

# For each post URL, read the page and take the `src` attribute of the
# iframe with id "mainFrame"; unlist() flattens the per-page results into a
# single character vector.
in_frame <- links %>%
  str_c(root, .) %>%
  # .[1:10] %>%
  map(
    ~ slow_read_html(.x) %>%
      html_nodes("iframe#mainFrame") %>%
      html_attr("src")
  ) %>%
  unlist()
# Prefix every collected iframe `src` value with the site origin; the
# resulting URLs are the pages that get scanned for images.
root <- "https://blog.naver.com"
tar <- str_c(root, in_frame)
# ---- Download every "postfiles" image of every post into img/ ----

# dir.create() warns when the directory already exists; create it only when
# it is missing so that re-running the script stays quiet.
if (!dir.exists("img")) {
  dir.create("img")
}

# Maximum number of downloads handed to a single crul::Async request set.
batch_size <- 100

# seq_along() performs no iteration when `tar` is empty, whereas
# 1:length(tar) would run over c(1, 0) and try to read tar[1] (NA).
for (i in seq_along(tar)) {
  cat("now : ", i, " / ", length(tar), "\n")

  # Every element of tar_img is a character vector c(download URL, file name).
  tar_img <- tar[i] %>%
    read_html() %>%
    html_nodes("img") %>%
    html_attr("src") %>%
    # Keep only sources that mention "postfiles"; missing src values are
    # dropped as well.
    str_subset("postfiles") %>%
    str_split("type=") %>%
    map(
      ~ {
        # Text in front of the first "type=" of the src value.
        base_url <- .x[1]
        c(
          # Re-append the query value so the w580 variant is requested.
          str_c(base_url, "type=w580"),
          # The 5th "/"-separated piece of the URL, minus the ".kdh4548"
          # suffix, is used as the local file name.
          str_replace(str_split(base_url, "/")[[1]][5], fixed(".kdh4548"), "")
        )
      }
    )

  cnt <- length(tar_img)
  print(cnt)
  if (cnt == 0) {
    next
  }

  # A lone image is fetched synchronously with httr.
  # overwrite = TRUE keeps a re-run from aborting on a file that an earlier
  # run already saved (write_disk() refuses existing paths by default).
  if (cnt == 1) {
    GET(
      tar_img[[1]][1],
      write_disk(str_c("img/", tar_img[[1]][2]), overwrite = TRUE)
    )
    next
  }

  img_urls <- map_chr(tar_img, 1)
  img_paths <- str_c("img/", map_chr(tar_img, 2))

  # Fetch the remaining images asynchronously, at most batch_size at a time.
  for (j in seq_len(ceiling(cnt / batch_size))) {
    batch <- seq(1 + batch_size * (j - 1), min(batch_size * j, cnt))
    cc <- Async$new(urls = img_urls[batch])
    res <- cc$get(disk = img_paths[batch], verbose = FALSE)
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment