Created
March 7, 2019 02:08
-
-
Save larry1001/5a23678482b4a868981ce5c9f6cd64a5 to your computer and use it in GitHub Desktop.
puppeteer 抓取新浪微博
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const puppeteer = require("puppeteer");
const qs = require('querystringify');
// Login credentials. Read from the environment so real secrets never have to
// be committed to the source file; the "XXXX" placeholders are kept as the
// fallback so the script behaves as before when the variables are not set.
const userName = process.env.WEIBO_USERNAME || "XXXX";
const passWord = process.env.WEIBO_PASSWORD || "XXXX";
const sleep = require("./utils").sleep;
const log = require('log4js').getLogger("weibo");
// Project-local settings object; the scraper exposes it to the page context.
const weiboconfig = require("./config").weibo;
log.level = 'info';
/**
 * Tells whether a request URL points at a static asset (jpg, png or css)
 * whose request is not worth logging.
 *
 * The extension must be followed by the end of the URL, a query string or a
 * fragment, so paths that merely contain "css" or "png" are not matched.
 *
 * @param {string} url - Absolute request URL.
 * @returns {boolean} True when the URL ends in a jpg/png/css extension.
 */
const isStaticAssetUrl = (url) => /\.(?:jpg|png|css)(?:[?#]|$)/i.test(url);

/**
 * Converts the raw page-count value taken from the pagination bar into a
 * usable integer.
 *
 * @param {*} rawCount - Value parsed out of the page bar (normally a string).
 * @returns {number} The positive integer page count, or 0 when the value is
 *   missing or not a positive number.
 */
const toPageCount = (rawCount) => {
  const parsed = Number.parseInt(rawCount, 10);
  return Number.isInteger(parsed) && parsed > 0 ? parsed : 0;
};

/**
 * Logs into Weibo with the module-level credentials, then walks every page of
 * the given user's feed and prints each post to the console.
 *
 * @param {string} nickname - Path segment of the Weibo profile URL to scrape.
 * @returns {Promise<void>} Resolves once all pages were visited and the
 *   browser was closed; rejects with the first error raised on the way (the
 *   browser is still closed in that case).
 */
const index = async function (nickname) {
  // jQuery is injected into every visited feed page; the in-page extraction
  // code below relies on `$`.
  const jqueryUrl = "https://cdn.bootcss.com/jquery/3.3.1/jquery.min.js";

  // Equivalent to opening a (visible) browser window.
  const browser = await puppeteer.launch({
    headless: false,
    args: [
      "--window-size=1360,768"
    ]
  });

  try {
    // Open a new tab that is used for both the login and the scraping.
    const pageWeiboUser = await browser.newPage();
    await pageWeiboUser.setViewport({
      width: 1360,
      height: 768
    });

    // Mirror the page's console output into the Node console.
    pageWeiboUser.on('console', (msg) => console.log(msg.text()));

    // Log every request except static assets.
    pageWeiboUser.on('request', (interceptedRequest) => {
      const url = interceptedRequest.url();
      if (!isStaticAssetUrl(url)) {
        console.log('A request url was made:', url);
      }
    });

    // Make the project configuration available inside the page as
    // `await window.wbconfig()`. Exposed functions always return a promise on
    // the page side, so no explicit Promise wrapper is needed here.
    await pageWeiboUser.exposeFunction('wbconfig', () => weiboconfig);

    /**
     * Opens the Weibo home page, types the credentials and submits the form.
     * @returns {Promise<void>}
     */
    const login = async function () {
      log.info("开始登录");
      await pageWeiboUser.goto("https://www.weibo.com", {
        timeout: 0
      });
      // Wait for the browser to finish loading.
      // NOTE(review): `goto` already resolves on "load"; this second wait only
      // returns if the site performs a follow-up navigation (e.g. a redirect).
      // Kept as in the original - confirm it is still required.
      await pageWeiboUser.waitForNavigation({
        waitUntil: ["load"],
        timeout: 0
      });

      // Simulate typing the user name.
      console.log("输入用户名...");
      await pageWeiboUser.waitForSelector("#loginname");
      await pageWeiboUser.focus("#loginname");
      await pageWeiboUser.keyboard.type(userName, {
        delay: 10
      });

      // Simulate typing the password once the field is present.
      console.log("输入密码...");
      await pageWeiboUser.waitForSelector("input[name=password]");
      await pageWeiboUser.focus("input[name=password]");
      await pageWeiboUser.keyboard.type(passWord, {
        delay: 10
      });

      // Simulate the click on the login button. The navigation wait must be
      // registered before the click: with `timeout: 0`, a navigation that
      // finishes before the wait starts would otherwise block forever.
      log.info("登录中...");
      await Promise.all([
        pageWeiboUser.waitForNavigation({
          waitUntil: ["load"],
          timeout: 0
        }),
        pageWeiboUser.click("a[action-type=btn_submit]", {
          delay: 500
        })
      ]);
      log.info("登录完成");
    };

    /**
     * Scrolls down step by step until the pagination bar is in the DOM.
     * @returns {Promise<void>}
     */
    const scrollToPageBar = async function () {
      const pageBarSelector = "div[node-type=feed_list_page]";
      let pageBar = await pageWeiboUser.$(pageBarSelector);
      while (!pageBar) {
        // Ask the browser to scroll further down.
        await pageWeiboUser.evaluate((scrollStep) => {
          document.scrollingElement.scrollTop += scrollStep;
        }, 1000);
        // Pause between scroll steps before checking for the bar again
        // (presumably milliseconds - see ./utils).
        await sleep(2000);
        pageBar = await pageWeiboUser.$(pageBarSelector);
      }
    };

    /**
     * Navigates to the given page of the user's feed and injects jQuery into
     * the freshly loaded document.
     * @param {number} pageNum - 1-based page number to open.
     * @returns {Promise<void>}
     */
    const gotoNextPage = async function (pageNum) {
      await pageWeiboUser.goto("https://weibo.com/" + nickname + "?is_search=0&visible=0&is_ori=1&is_tag=0&profile_ftype=1&page=" + pageNum + "#feedtop");
      await pageWeiboUser.addScriptTag({
        url: jqueryUrl
      });
    };

    /**
     * Reads the total number of feed pages from the pagination bar.
     * @returns {Promise<string|undefined>} The raw `countPage` value found in
     *   the bar's `action-data` attribute, or undefined when it is absent.
     */
    const getTotalPage = async function () {
      await scrollToPageBar();
      // Ask the page for the pagination link's `action-data` attribute.
      const pageInfo = await pageWeiboUser.evaluate(() => {
        return $("div[node-type=feed_list_page] div > span > a").attr("action-data");
      });
      // A missing attribute is parsed as an empty query string.
      return qs.parse(pageInfo || "").countPage;
    };

    /**
     * Extracts and prints every post of the currently loaded feed page.
     * @param {number} pageNum - Page being scraped (currently unused, kept for
     *   the caller's signature).
     * @returns {Promise<void>}
     */
    const getWeiboContent = async function (pageNum) {
      await scrollToPageBar();
      await pageWeiboUser.evaluate(() => {
        document.scrollingElement.scrollTop = 300;
      });

      // Number of posts on the page.
      const count = await pageWeiboUser.evaluate(() => {
        return $("div[action-type=feed_list_item]").length;
      });
      log.info("weibo count " + count);

      const wc = await pageWeiboUser.evaluate(async () => {
        // `wbconfig` is a function installed through exposeFunction; it has to
        // be called and awaited to obtain the configuration object.
        // Assumes the configuration carries a `domain` string - TODO confirm.
        const config = await window.wbconfig();
        return [...$("div[action-type=feed_list_item]")].map((weibo) => {
          const item = $(weibo);
          const dateLink = item.find("[node-type=feed_list_item_date]");
          console.log(item.html());
          return {
            weiboId: $.trim(item.attr("mid")),
            content: $.trim(item.find("div[node-type=feed_list_content]").text()),
            create_time: $.trim(dateLink.attr("title")),
            weibo_url: config.domain + dateLink.attr("href"),
            repost_num: item.find("[action-type=fl_forward] em:eq(1)").text()
          };
        });
      });

      for (const we of wc) {
        console.log(we);
      }

      process.stdout.write(".");
      await sleep(50);
      process.stdout.write("\n");
    };

    await login();

    // Open the user's profile and prepare it for scraping.
    await pageWeiboUser.goto("https://weibo.com/" + nickname + "?profile_ftype=1&is_ori=1");
    await pageWeiboUser.waitForSelector("div#plc_frame");
    await pageWeiboUser.addScriptTag({
      url: jqueryUrl
    });

    const countPage = toPageCount(await getTotalPage());
    for (let pageNum = 1; pageNum <= countPage; pageNum++) {
      // Page 1 is the profile page that is already loaded; later pages are
      // opened right before they are scraped, so no navigation happens after
      // the last page.
      if (pageNum > 1) {
        await gotoNextPage(pageNum);
      }
      console.log("开始抓取第[" + pageNum + "]页数据...");
      await getWeiboContent(pageNum);
      console.log("第[" + pageNum + "]页数据抓取结束");
    }
    console.log("\n\n抓取结束");
  } finally {
    // Close the browser on failure as well, so no browser process is left
    // behind.
    await browser.close();
  }
};
// Script entry point: scrape the hard-coded account. The returned promise is
// handled explicitly so a failure is reported and reflected in the exit code
// instead of surfacing as an unhandled rejection.
index("bbshefeicc").catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment