Skip to content

Instantly share code, notes, and snippets.

@larryyangsen
Last active June 12, 2024 00:55
Show Gist options
  • Save larryyangsen/3d874235791f6ba22d03958a01c8b76d to your computer and use it in GitHub Desktop.
Save larryyangsen/3d874235791f6ba22d03958a01c8b76d to your computer and use it in GitHub Desktop.
import { youtube } from './youtube.mjs'
import { searchYoutubeWithInSubtitles } from './searchYoutubeWithInSubtitles.mjs'
import { readYoutubeTxt, sleep } from './lib.mjs';
// True when the script was started with the `--search` command-line flag.
const isSearchYoutube = process.argv.includes('--search');
/**
 * Process every URL with `youtube`, strictly one after another.
 *
 * A rejection from `youtube` stops the run: the remaining URLs are not
 * processed and the returned promise rejects with that error.
 *
 * @param {string[]} urls - URLs handed to `youtube` in order.
 * @returns {Promise<void>} Resolves once the last URL has been processed.
 */
const downloadYoutubeFromUrls = async (urls) => {
  for (const url of urls) {
    await youtube(url);
  }
};
/**
 * Script entry point.
 *
 * Picks the URL list (a YouTube search for 'Nvidia' when the `--search`
 * flag is set, otherwise whatever `readYoutubeTxt` returns), pauses for
 * 1500 ms, prints the list, and then hands it to `downloadYoutubeFromUrls`.
 *
 * @returns {Promise<void>}
 */
async function main() {
  let urls;
  if (isSearchYoutube) {
    urls = await searchYoutubeWithInSubtitles('Nvidia');
  } else {
    // Deliberately not awaited, matching how the value has always been used.
    urls = readYoutubeTxt();
  }

  await sleep(1500);
  console.log('urls: ', urls);
  await downloadYoutubeFromUrls(urls);
}
/**
 * Pause for a number of milliseconds.
 *
 * @param {number} [ms=1000] - Delay passed to `setTimeout`.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
export const sleep = (ms = 1000) => {
  const schedule = (done) => {
    setTimeout(done, ms);
  };
  return new Promise(schedule);
};
import puppeteer from 'puppeteer';
import fs from 'fs';
import { sleep } from './lib.mjs';
/**
 * Search YouTube in a headless browser, apply result filters, and collect
 * the watch links found on the results page.
 *
 * The browser is launched with a zh-TW language preference because the
 * filter entries are located by their Traditional Chinese titles.
 * The de-duplicated links are written to `./youtube.txt` (one per line)
 * and also returned.
 *
 * @param {string} [query='新聞'] - Search text; URL-encoded before navigation.
 * @param {number} [count=10] - Number of scrape-then-scroll passes to run.
 * @returns {Promise<string[]>} Unique `watch?v=` links in first-seen order.
 * @throws Propagates any Puppeteer or file-system error; the browser is
 *   closed before the error leaves this function.
 */
export const searchYoutubeWithInSubtitles = async (query = '新聞', count = 10) => {
  const browser = await puppeteer.launch({
    headless: true,
    args: ['--lang=zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7'],
  });

  let uniqueLinks;
  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Build the URL through URLSearchParams so characters such as `&`, `#`
    // or spaces in the query cannot break or truncate the search.
    const searchUrl = new URL('https://www.youtube.com/results');
    searchUrl.searchParams.set('search_query', query);
    await page.goto(searchUrl.href);
    await page.waitForSelector('ytd-thumbnail.ytd-video-renderer');

    // Open the filter dialog and wait for its backdrop to appear.
    const clickFilterButton = async () => {
      await page.click('#filter-button');
      await page.waitForSelector('tp-yt-iron-overlay-backdrop.opened');
    };

    // Read the currently rendered result titles and keep only watch links,
    // dropping everything after the first `&`.
    const scraperLinks = () =>
      page.evaluate(() => {
        const anchors = Array.from(document.querySelectorAll('#video-title'));
        return anchors
          .map((anchor) => anchor.href)
          .filter((href) => href?.includes('watch?v='))
          .map((link) => link.split('&')[0]);
      });

    // Scroll the window down by one viewport height.
    const scrollDown = async () => {
      await page.evaluate(() => {
        window.scrollBy(0, window.innerHeight);
      });
    };

    // Keep scrolling until two consecutive measurements of the document
    // height agree, i.e. scrolling no longer makes the page grow.
    const scrollDownToBottom = async () => {
      let prevHeight = 0;
      let currHeight = -1;
      while (prevHeight !== currHeight) {
        prevHeight = currHeight;
        await scrollDown();
        await sleep(1000);
        currHeight = await page.evaluate(() => document.body.scrollHeight);
      }
    };

    // Filter entries to click, matched by their zh-TW title text.
    const filterSelectors = [
      // "Search for 'Subtitles'"
      'a div[title="搜尋「字幕」"]',
      // Disabled alternative ("this month"): 'a div[title="搜尋「本月」"]'
      // "Search for 'Under 4 minutes'"
      'a div[title="搜尋「4 分鐘內」"]',
      // Disabled alternative ("4 to 20 minutes"): 'a div[title="搜尋「4 至 20 分鐘」"]'
    ];
    for (const selector of filterSelectors) {
      await clickFilterButton();
      await page.click(selector);
      await page.waitForSelector('tp-yt-paper-dialog[aria-hidden="true"]');
      await sleep(1000);
    }

    // A Set de-duplicates as we go and preserves first-seen order, avoiding
    // re-copying the whole accumulated list on every pass.
    const collected = new Set();
    for (let i = 0; i < count; i++) {
      for (const link of await scraperLinks()) {
        collected.add(link);
      }
      await scrollDownToBottom();
      await sleep(1000);
    }
    uniqueLinks = [...collected];
  } finally {
    // Always release the browser process, even when a step above failed.
    await browser.close();
  }

  fs.writeFileSync('./youtube.txt', uniqueLinks.join('\n'));
  return uniqueLinks;
};
import youtubeDl from 'youtube-dl-exec';
import Logger from 'progress-estimator';
// Logger obtained by calling progress-estimator's default export; it is
// invoked below as logger(promise, label).
const logger = Logger();
// Prefix for output file paths: the OUTPUT_PATH environment variable, or
// './output' when that variable is unset or empty.
const OUTPUT_PATH = process.env.OUTPUT_PATH || './output';
/**
 * Fetch metadata for one YouTube URL and then download it as MP3 audio
 * with VTT subtitles via youtube-dl-exec.
 *
 * The output path is `${OUTPUT_PATH}/<id>`, where `<id>` is the id from the
 * metadata when present, otherwise the `v` query parameter of `url`.
 *
 * This function is best-effort: a failure in either the metadata lookup or
 * the download is logged and swallowed so that a caller processing a list
 * of URLs can continue with the next one.
 *
 * @param {string} url - Video URL to process.
 * @returns {Promise<void>} Resolves when the attempt has finished.
 */
export const youtube = async (url) => {
  // Prefer the real `v` query parameter so trailing parameters such as
  // `&list=...` never end up in the file name; fall back to the plain
  // textual split when `url` cannot be parsed or has no `v` parameter.
  const extractVideoId = (value) => {
    try {
      const id = new URL(value).searchParams.get('v');
      if (id) {
        return id;
      }
    } catch {
      // Not an absolute URL; use the textual fallback below.
    }
    return value.split('v=')[1];
  };

  try {
    const res = await youtubeDl(url, {
      dumpSingleJson: true,
      addHeader: ['referer:youtube.com', 'user-agent:googlebot'],
    });
    const idFromUrl = extractVideoId(url);
    const output = `${OUTPUT_PATH}/${res?.id ?? idFromUrl}`;
    const promise = youtubeDl(
      url,
      {
        audioFormat: 'mp3',
        keepVideo: false,
        extractAudio: true,
        subFormat: 'vtt',
        convertSubs: 'vtt',
        maxDownloads: 3,
        writeSub: true,
        output,
        allSubs: true,
        noCheckCertificates: true,
        cacheDir: './tmp',
      },
      {}
    );
    // The logger wraps the pending download and is awaited in its place.
    await logger(promise, `Obtaining ${url}`);
  } catch (e) {
    console.log('e: ', e);
  }
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment