我有一个算法来抓取网页,这样一旦用户输入URL,我就可以从中获取og数据。这适用于大多数页面。然而,每当我试图抓取CNN文章时,它就会产生一个错误,导致我的节点服务器崩溃。
例如,最近。。。
https://www.cnn.com/2022/01/14/investing/dogecoin-elon-musk-tesla/index.html
导致服务器崩溃。
web validator
显示它有很多错误——包括与元数据相关的错误。
错误似乎来自Dom解析器。
throw new ParseError(error, this.locator);
^
Error [ParseError]: Attribute id redefined
这是我的服务器代码。我很难确定这里出了什么问题以及如何解决。
const express = require("express");
const bodyParser = require("body-parser");
const axios = require("axios");
const xpath = require("xpath");
const { DOMParser } = require("xmldom");
const cors = require("cors");
const app = express();
app.use(bodyParser.json());
app.use(cors());
app.post("/scrape", (req, res) => {
const { body } = req;
const { url } = body;
return parseUrl(url).then((result) => res.json(result));
});
app.listen(3001, () => console.log("OG Scraper Listening..."));
const xpaths = {
title: 'string(//meta[@property="og:title"]/@content)',
description: 'string(//meta[@property="og:description"]/@content)',
image: 'string(//meta[@property="og:image"]/@content)',
keywords: 'string(//meta[@property="og:keywords"]/@content)',
url: 'string(//meta[@property="og:url"]/@content)',
published_time: 'string(//meta[@property="article:published_time"]/@content)',
modified_time: 'string(//meta[@property="article:modified_time"]/@content)',
published: 'string(//meta[@property="article:published"]/@content)',
modified: 'string(//meta[@property="article:modified"]/@content)',
site_name: 'string(//meta[@property="og:site_name"]/@content)',
locale: 'string(//meta[@property="og:locale"]/@content)',
content_tier: 'string(//meta[@property="article:content_tier"]/@content)',
opinion: 'string(//meta[@property="article:opinion"]/@content)',
type: 'string(//meta[@property="og:type"]/@content)',
};
const retrievePage = (url) => axios.request({ url });
const convertBodyToDocument = (body) => new DOMParser().parseFromString(body);
const nodesFromDocument = (document, xpathselector) => {
return xpath.select(xpathselector, document);
};
const mapProperties = (paths, document) =>
Object.keys(paths).reduce(
(acc, key) => ({ ...acc, [key]: nodesFromDocument(document, paths[key]) }),
{}
);
const parseUrl = (url) =>
retrievePage(url).then((response) => {
const validData = response.data;
const document = convertBodyToDocument(validData);
const mappedProperties = mapProperties(xpaths, document);
return mappedProperties;
});