Last active
November 28, 2024 05:31
-
-
Save taowen/3a0ee294ae60fd7e8f14f4af81edf38e to your computer and use it in GitHub Desktop.
extract text from https://ar5iv.labs.arxiv.org/
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ==UserScript==
// @name arxiv论文转markdown拷贝到剪贴板
// @description 方便粘贴到 chatgpt 进行问答
// @namespace github.com/taowen
// @match https://ar5iv.labs.arxiv.org/*
// @match https://browse.arxiv.org/html/*
// @match https://arxiv.org/html/*
// @version 1.0.1
// @author taowen
// @license MIT
// @grant GM.registerMenuCommand
// @grant GM_setClipboard
// ==/UserScript==
// Registers the userscript menu entry (label: "Copy paper to clipboard").
// When activated it converts either the element enclosing the current
// selection or, when there is no usable selection, the whole <body> into
// Markdown-flavoured text and places it on the clipboard.
GM.registerMenuCommand("复制论文到剪贴板", () => {
  const selection = document.getSelection();
  // anchorNode/focusNode are the standard Selection properties; the
  // baseNode/extentNode aliases are non-standard and undefined in some
  // browsers, which made the selection silently ignored there.
  const anchorNode = selection && selection.anchorNode;
  const focusNode = selection && selection.focusNode;
  const selectionRoot = anchorNode ? getCommonAncestor(anchorNode, focusNode) : null;
  // getCommonAncestor returns null when no ancestor contains both nodes;
  // fall back to the whole page instead of passing null to extractText.
  const targetNode = selectionRoot || document.body;
  const text = extractText(targetNode);
  GM_setClipboard(text);
  alert(`copied ${text.length} characters`);
});
/**
 * Walks up from `node1` and returns the first strict ancestor of `node1`
 * that also contains `node2`.
 *
 * The containment test uses `contains` when `node1` exposes it, otherwise
 * `compareDocumentPosition` checked against the 0x10 ("contained by") bit.
 * Note that `node1` itself is never returned: the search starts at its parent.
 *
 * @param {Node} node1 - Node whose ancestor chain is searched.
 * @param {Node} node2 - Node that the returned ancestor must contain.
 * @returns {Node|null} The nearest matching ancestor, or null if none matches.
 */
function getCommonAncestor(node1, node2) {
  const hasContains = "contains" in node1;
  const methodName = hasContains ? "contains" : "compareDocumentPosition";
  // `contains` yields a boolean (true & 1 === 1); compareDocumentPosition
  // yields a bitmask in which 0x10 marks "other node is contained by this one".
  const mask = hasContains ? 1 : 0x10;

  for (let ancestor = node1.parentNode; ancestor; ancestor = ancestor.parentNode) {
    const result = ancestor[methodName](node2);
    if ((result & mask) === mask) {
      return ancestor;
    }
  }
  return null;
}
/**
 * Converts a DOM subtree into Markdown-flavoured plain text.
 *
 * Conversion rules:
 *  - text nodes contribute their trimmed text;
 *  - <button> subtrees are skipped;
 *  - <h1>..<h6> become "#"-prefixed heading lines built from `innerText`;
 *  - <math> becomes `$...$` using its `alttext` attribute (falling back to
 *    the element's text content when the attribute is missing);
 *  - elements carrying the `ltx_note_outer` class become a single-line
 *    `> ` blockquote;
 *  - <p>, <li> and `ltx_listingline` elements are followed by a newline.
 *
 * @param {Node} node - Root of the subtree to convert.
 * @param {string[]} [parts] - Accumulator used by recursive calls. When
 *   omitted, a fresh accumulator is created and the joined text is returned.
 * @returns {string|undefined} The pieces joined with single spaces when
 *   `parts` was omitted; otherwise undefined (output is appended to `parts`).
 */
function extractText(node, parts) {
  if (parts === undefined) {
    // Top-level call: always hand back a string, including for nodes that
    // take one of the early exits below (text nodes, headings, math, ...).
    const collected = [];
    extractText(node, collected);
    return collected.join(' ');
  }

  // Only text nodes expose `wholeText`.
  if (node.wholeText !== undefined) {
    parts.push(node.wholeText.trim());
    return undefined;
  }

  const tagName = (node.tagName || '').toLowerCase();
  if (tagName === 'button') {
    return undefined;
  }

  const headingMatch = /^h([1-6])$/.exec(tagName);
  if (headingMatch) {
    const marker = '#'.repeat(Number(headingMatch[1]));
    parts.push(`\n${marker} ${node.innerText}\n`);
    return undefined;
  }

  if (tagName === 'math') {
    const altAttr = node.attributes && node.attributes.alttext;
    // Not every <math> element is guaranteed to carry alttext; fall back to
    // the rendered text rather than throwing on the missing attribute.
    const formula = (altAttr && altAttr.value) || (node.textContent || '').trim();
    parts.push(`$${formula}$`);
    return undefined;
  }

  const cssClass = node.attributes && node.attributes.class && node.attributes.class.value;
  // Compare class tokens so elements with several classes still match.
  const classNames = cssClass ? cssClass.split(/\s+/) : [];
  const hasClass = (name) => classNames.indexOf(name) !== -1;

  if (hasClass('ltx_note_outer')) {
    let noteText = '';
    for (const child of node.childNodes) {
      noteText += extractText(child);
    }
    // A blockquote must stay on one line, so drop every newline, not just
    // the first one.
    parts.push(`\n> ${noteText.replace(/\n/g, '')}\n\n`);
  } else {
    for (const child of node.childNodes) {
      extractText(child, parts);
    }
  }

  if (tagName === 'p' || tagName === 'li' || hasClass('ltx_listingline')) {
    parts.push('\n');
  }
  return undefined;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment