Spaces:
Sleeping
Sleeping
| import { JSDOM, VirtualConsole } from "jsdom"; | |
/**
 * Strips `<script>` and `<style>` content from the subtree rooted at `node`, in place.
 *
 * - Any descendant whose node name is SCRIPT or STYLE is detached from its parent.
 * - If `node` itself is a SCRIPT/STYLE node, all of its children are removed
 *   (the node itself cannot be detached from here because its parent is not known).
 *
 * @param node - Root of the subtree to clean; it is mutated.
 */
function removeTags(node: Node): void {
  // Compare case-insensitively: HTML elements report upper-case node names,
  // but elements in other namespaces (e.g. <style> inside inline SVG) keep
  // their original lower-case spelling.
  const isStripped = (candidate: Node): boolean => {
    const name = candidate.nodeName.toUpperCase();
    return name === "SCRIPT" || name === "STYLE";
  };

  const nodeIsStripped = isStripped(node);

  // `childNodes` is a live list, so removing while iterating it directly
  // would skip the sibling that follows each removed child. Iterate a snapshot.
  for (const childNode of Array.from(node.childNodes)) {
    if (nodeIsStripped || isStripped(childNode)) {
      node.removeChild(childNode);
    } else {
      removeTags(childNode);
    }
  }
}
/**
 * Recursively collects the text of `node`'s descendants.
 *
 * Each direct child contributes one segment: a text node contributes its own
 * text, an element contributes the result of recursing into it, and any other
 * node type (comments, etc.) contributes an empty string. Segments are joined
 * with "\n".
 *
 * @param node - Root whose children are walked; the node itself is not modified.
 * @returns The joined text, or "" when `node` has no children.
 */
function naiveInnerText(node: Node): string {
  return [...node.childNodes]
    .map((childNode) => {
      // The DOM `Node` constructor is not a Node.js global, so the node-type
      // constants are read from the instance (they are exposed on every node
      // through the prototype chain).
      switch (childNode.nodeType) {
        case childNode.TEXT_NODE:
          // Use the child's own text: the parent's textContent would repeat
          // the whole subtree once per text child.
          return childNode.textContent ?? "";
        case childNode.ELEMENT_NODE:
          return naiveInnerText(childNode);
        default:
          return "";
      }
    })
    .join("\n");
}
/**
 * Downloads the page at `url` and returns the text of its `<body>` with
 * script/style content stripped, newlines removed and double spaces collapsed.
 *
 * Fetching is best effort: if the request fails or takes longer than 10
 * seconds, the error is logged and an empty document is parsed instead, which
 * produces an empty string.
 *
 * @param url - Address of the page to download.
 * @returns The extracted body text.
 * @throws Error if the parsed document has no `<body>` element.
 */
export async function parseWeb(url: string): Promise<string> {
  // Abort the request (including the body download) after 10 seconds.
  const abortController = new AbortController();
  const timeoutId = setTimeout(() => abortController.abort(), 10000);

  let htmlString = "";
  try {
    const response = await fetch(url, { signal: abortController.signal });
    htmlString = await response.text();
  } catch (err) {
    // Best effort: log and fall through with an empty document.
    console.log(err);
  } finally {
    // Without this the pending timer keeps the event loop alive for up to
    // 10 seconds after the fetch has already settled.
    clearTimeout(timeoutId);
  }

  const virtualConsole = new VirtualConsole();
  virtualConsole.on("error", () => {
    // No-op to skip console errors.
  });

  // put the html string into a DOM
  const dom = new JSDOM(htmlString, {
    virtualConsole,
  });

  const body = dom.window.document.querySelector("body");
  if (!body) throw new Error("body of the webpage is null");

  removeTags(body);

  // recursively extract text content from the body and then remove newlines and multiple spaces
  return naiveInnerText(body).replace(/ {2}|\r\n|\n|\r/gm, "");
}