// Spaces: Runtime error (page-header residue from the listing this file was copied from; not part of the module)
import { JSDOM, VirtualConsole } from "jsdom"; | |
/**
 * Recursively removes every `<script>` and `<style>` element found beneath
 * `node`, mutating the tree in place.
 *
 * The root `node` itself is never removed; only its descendants are inspected.
 *
 * @param node - Root of the subtree to clean.
 */
function removeTags(node: Node) {
  // `childNodes` is a live list: removing an entry while iterating it directly
  // would shift the remaining entries and skip siblings, so walk a snapshot.
  for (const childNode of Array.from(node.childNodes)) {
    // Compare case-insensitively: elements outside the HTML namespace (e.g. a
    // <style> inside inline SVG) report their nodeName in its original case.
    const name = childNode.nodeName.toUpperCase();
    if (name === "SCRIPT" || name === "STYLE") {
      node.removeChild(childNode);
    } else {
      removeTags(childNode);
    }
  }
}
/**
 * Approximates `innerText` for environments without layout: walks the subtree
 * under `node` and joins the text of its text nodes with newlines.
 *
 * Text nodes contribute their own text, element nodes contribute the result of
 * a recursive call, and every other node type (comments, etc.) contributes an
 * empty string.
 *
 * @param node - Root of the subtree to extract text from.
 * @returns The collected text, one entry per child, separated by `"\n"`.
 */
function naiveInnerText(node: Node): string {
  // The node-type constants are read off the instance (via its prototype
  // chain) because the DOM `Node` constructor is not a Node.js global.
  return Array.from(node.childNodes)
    .map((childNode): string => {
      switch (childNode.nodeType) {
        case node.TEXT_NODE:
          // Use the text node's own content; the parent's textContent would
          // repeat the whole subtree's text once per text child.
          return childNode.textContent ?? "";
        case node.ELEMENT_NODE:
          return naiveInnerText(childNode);
        default:
          return "";
      }
    })
    .join("\n");
}
/**
 * Downloads the page at `url` and returns the text of its `<body>` with
 * `<script>`/`<style>` content removed.
 *
 * The request is aborted after 10 seconds. Any failure while fetching or
 * reading the response is logged and an empty string is parsed instead, so a
 * network error does not reject the returned promise.
 *
 * @param url - Address of the page to fetch.
 * @returns The extracted text with line breaks and pairs of consecutive spaces
 *   stripped out.
 * @throws Error if the parsed document has no `<body>` element.
 */
export async function parseWeb(url: string): Promise<string> {
  const abortController = new AbortController();
  const timeout = setTimeout(() => abortController.abort(), 10000);

  let htmlString = "";
  try {
    const response = await fetch(url, { signal: abortController.signal });
    htmlString = await response.text();
  } catch (err) {
    // Best effort: log the failure and continue with an empty document.
    console.log(err);
  } finally {
    // Don't leave the abort timer pending once the request has settled.
    clearTimeout(timeout);
  }

  const virtualConsole = new VirtualConsole();
  virtualConsole.on("error", () => {
    // No-op to skip console errors.
  });

  // put the html string into a DOM
  const dom = new JSDOM(htmlString, {
    virtualConsole,
  });

  const body = dom.window.document.querySelector("body");
  if (!body) throw new Error("body of the webpage is null");

  removeTags(body);

  // recursively extract text content from the body, then strip line breaks
  // and pairs of consecutive spaces
  return naiveInnerText(body).replace(/ {2}|\r\n|\n|\r/gm, "");
}