Last active
June 12, 2024 00:55
-
-
Save larryyangsen/3d874235791f6ba22d03958a01c8b76d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { youtube } from './youtube.mjs' | |
import { searchYoutubeWithInSubtitles } from './searchYoutubeWithInSubtitles.mjs' | |
import { readYoutubeTxt, sleep } from './lib.mjs'; | |
// Command-line switch that turns on the live-search mode of this script.
const SEARCH_FLAG = '--search';
// True when the switch appears anywhere in the process arguments.
const isSearchYoutube = process.argv.includes(SEARCH_FLAG);
/**
 * Runs `youtube` for every URL, strictly one after another and in list order.
 * A rejection from `youtube` stops the loop and rejects the returned promise.
 *
 * @param {string[]} urls - URLs handed to `youtube`, one per call.
 * @returns {Promise<void>} Settles once the last call has finished.
 */
const downloadYoutubeFromUrls = async (urls) => {
  for (const url of urls) {
    // Awaiting inside the loop keeps the calls sequential.
    await youtube(url);
  }
};
/**
 * Script entry point: obtains a list of URLs, prints it, then passes it to
 * `downloadYoutubeFromUrls`.
 *
 * With the `--search` flag the list comes from `searchYoutubeWithInSubtitles('Nvidia')`;
 * otherwise it comes from `readYoutubeTxt()`.
 *
 * @returns {Promise<void>}
 */
async function main() {
  let urls;
  if (isSearchYoutube) {
    urls = await searchYoutubeWithInSubtitles('Nvidia');
  } else {
    // NOTE(review): called without await — presumably synchronous; confirm in lib.mjs.
    urls = readYoutubeTxt();
  }

  // Fixed pause before the list is printed and the downloads start.
  await sleep(1500);
  console.log('urls: ', urls);
  await downloadYoutubeFromUrls(urls);
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Returns a promise that fulfils (with `undefined`) after a timer delay.
 *
 * @param {number} [ms=1000] - Delay passed to `setTimeout`, in milliseconds.
 * @returns {Promise<void>}
 */
export const sleep = (ms = 1000) =>
  new Promise((done) => {
    setTimeout(done, ms);
  });
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import puppeteer from 'puppeteer'; | |
import fs from 'fs'; | |
import { sleep } from './lib.mjs'; | |
/**
 * Searches YouTube for `query` in a headless browser, applies two result
 * filters through the filter dialog, and collects the watch URLs found on the
 * results page while repeatedly scrolling it.
 *
 * The de-duplicated URLs are written to `./youtube.txt` (one per line) and
 * also returned.
 *
 * @param {string} [query='新聞'] - Raw search text; it is URL-encoded here, so
 *   pass it unencoded.
 * @param {number} [count=10] - Number of scrape-then-scroll passes.
 * @returns {Promise<string[]>} Unique watch URLs in first-seen order.
 * @throws Propagates any Puppeteer error (navigation, missing selector, ...);
 *   the browser is closed before the error leaves this function.
 */
export const searchYoutubeWithInSubtitles = async (query = '新聞', count = 10) => {
  const browser = await puppeteer.launch({
    headless: true,
    args: ['--lang=zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7'],
  });

  // A Set keeps first-seen order and removes duplicates as links are added.
  const links = new Set();

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });
    // Encode the query so characters such as '&', '#' or spaces cannot break the URL.
    await page.goto(`https://www.youtube.com/results?search_query=${encodeURIComponent(query)}`);
    await page.waitForSelector('ytd-thumbnail.ytd-video-renderer');

    // Opens the filter dialog and waits until its backdrop is shown.
    const clickFilterButton = async () => {
      await page.click('#filter-button');
      await page.waitForSelector('tp-yt-iron-overlay-backdrop.opened');
    };

    // Reads every video-title anchor in the page and keeps the bare watch URL
    // (everything from the first '&' onwards is dropped).
    const scraperLinks = () =>
      page.evaluate(() => {
        const anchors = Array.from(document.querySelectorAll('#video-title'));
        return anchors
          .map((anchor) => anchor.href)
          .filter((href) => href?.includes('watch?v='))
          .map((link) => link.split('&')[0]);
      });

    // Adds the links currently present in the page to the result set.
    const collectLinks = async () => {
      for (const link of await scraperLinks()) {
        links.add(link);
      }
    };

    // Scrolls the window down by one viewport height.
    const scrollDown = async () => {
      await page.evaluate(() => {
        window.scrollBy(0, window.innerHeight);
      });
    };

    // Keeps scrolling until two consecutive measurements of the document
    // height are equal.
    const scrollDownToBottom = async () => {
      let prevHeight = 0;
      let currHeight = -1;
      while (prevHeight !== currHeight) {
        prevHeight = currHeight;
        await scrollDown();
        await sleep(1000);
        currHeight = await page.evaluate(() => document.body.scrollHeight);
      }
    };

    // Filter entries are matched by their zh-TW tooltip text, so the browser
    // must render the page in that locale (see the --lang launch argument).
    for (const selector of [
      'a div[title="搜尋「字幕」"]', // "Subtitles/CC"
      // 'a div[title="搜尋「本月」"]', // "This month"
      'a div[title="搜尋「4 分鐘內」"]', // "Under 4 minutes"
      // 'a div[title="搜尋「4 至 20 分鐘」"]', // "4 to 20 minutes"
    ]) {
      await clickFilterButton();
      await page.click(selector);
      await page.waitForSelector('tp-yt-paper-dialog[aria-hidden="true"]');
      await sleep(1000);
    }

    for (let i = 0; i < count; i++) {
      await collectLinks();
      await scrollDownToBottom();
      await sleep(1000);
    }
    if (count > 0) {
      // Each pass scrapes before it scrolls, so pick up whatever the last
      // scroll brought into the page.
      await collectLinks();
    }
  } finally {
    // Always release the browser process, including when a step above throws.
    await browser.close();
  }

  const uniqueLinks = [...links];
  fs.writeFileSync('./youtube.txt', uniqueLinks.join('\n'));
  return uniqueLinks;
};
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import youtubeDl from 'youtube-dl-exec'; | |
import Logger from 'progress-estimator'; | |
// Progress reporter wrapped around the pending download promise.
const logger = Logger();
// Directory prefix for downloaded files; the OUTPUT_PATH environment variable overrides it.
const OUTPUT_PATH = process.env.OUTPUT_PATH || './output';

/**
 * Downloads one video's audio (mp3) and subtitles (vtt) with youtube-dl,
 * writing to `${OUTPUT_PATH}/<video id>`.
 *
 * Both the metadata lookup and the download are best-effort: a failure is
 * logged and the returned promise still fulfils, so one bad URL does not
 * abort a caller that processes a list of URLs.
 *
 * The video id is taken, in order of preference, from the metadata lookup,
 * from the URL's `v` query parameter, or — when neither is available — left
 * to youtube-dl through its `%(id)s` output-template field.
 *
 * @param {string} url - Video URL passed to youtube-dl.
 * @returns {Promise<void>}
 */
export const youtube = async (url) => {
  let videoId;

  try {
    const res = await youtubeDl(url, {
      dumpSingleJson: true,
      addHeader: ['referer:youtube.com', 'user-agent:googlebot'],
    });
    videoId = res?.id;
  } catch (e) {
    // Metadata is only needed for the file name; keep going with a fallback id.
    console.log('e: ', e);
  }

  if (!videoId) {
    try {
      // Parsing the query string avoids keeping trailing "&list=..." style parameters.
      videoId = new URL(url).searchParams.get('v');
    } catch {
      // Not a parseable absolute URL; fall through to the output template below.
    }
  }

  const output = `${OUTPUT_PATH}/${videoId || '%(id)s'}`;

  try {
    const promise = youtubeDl(
      url,
      {
        audioFormat: 'mp3',
        keepVideo: false,
        extractAudio: true,
        subFormat: 'vtt',
        convertSubs: 'vtt',
        maxDownloads: 3,
        writeSub: true,
        output,
        allSubs: true,
        noCheckCertificates: true,
        cacheDir: './tmp',
      },
      {}
    );
    await logger(promise, `Obtaining ${url}`);
  } catch (e) {
    console.log('e: ', e);
  }
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment