我正试图从中抓取网址
https://en.wikipedia.org/wiki/List_of_hedge_funds
通过使用
an Apify actor called "web-scraper"
(
https://apify.com/apify/web-scraper
)
具体来说,我试图使用以下Apify
pageFunction
抓取该目标页面,并从HTML中存在的锚点标签返回URL列表。
page功能
async function pageFunction( context ) {
const url = 'https://en.wikipedia.org/wiki/List_of_hedge_funds';
const cssSelector = 'tr > td > a';
const $ = context.jQuery;
const pageTitle = $('title').first().text();
const anchorTag = $( cssSelector );
return {
url: context.request.url,
pageTitle, anchorTag,
};
}
在我的控制台中,我希望看到
href
在名为的属性中存在于目标页面上的一个或多个锚点标记的属性
anchorTag
。我还希望在名为的属性中看到页面标题
pageTitle
以及
url
财产。具体如下:
我希望看到的是:
{
"url": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
"pageTitle": "List of hedge funds - Wikipedia",
"anchorTag": {
"0": "http://example0.com", // each instance of "http://example.com" represents a unique url on the target page to be scraped
"1": "http://example1.com",
"2": "http://example2.com",
"3": "http://example3.com",
...
"39": "http://example39.com",
}}
但是,actor返回的不是URL列表,而是以下数据集:
我实际看到的:
[{
"url": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
"pageTitle": "List of hedge funds - Wikipedia",
"anchorTag": {
"0": {},
"1": {},
"2": {},
"3": {},
"4": {},
"5": {},
"6": {},
"7": {},
"8": {},
"9": {},
"10": {},
"11": {},
"12": {},
"13": {},
"14": {},
"15": {},
"16": {},
"17": {},
"18": {},
"19": {},
"20": {},
"21": {},
"22": {},
"23": {},
"24": {},
"25": {},
"26": {},
"27": {},
"28": {},
"29": {},
"30": {},
"31": {},
"32": {},
"33": {},
"34": {},
"35": {},
"36": {},
"37": {},
"38": {},
"39": {},
"length": 40,
"prevObject": {
"0": {
"location": {
"href": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
"ancestorOrigins": {},
"origin": "https://en.wikipedia.org",
"protocol": "https:",
"host": "en.wikipedia.org",
"hostname": "en.wikipedia.org",
"port": "",
"pathname": "/wiki/List_of_hedge_funds",
"search": "",
"hash": "",
"assign": {},
"reload": {},
"toString": {},
"replace": {}
},
"write": {},
"writeln": {},
"jQuery3410461525655351679551": {
"events": {
"mmv-setup-overlay": [
{
"type": "mmv-setup-overlay",
"origType": "mmv-setup-overlay",
"handler": {
"guid": 21
},
"guid": 21,
"namespace": ""
}
],
"mmv-cleanup-overlay": [
{
"type": "mmv-cleanup-overlay",
"origType": "mmv-cleanup-overlay",
"handler": {
"guid": 22
},
"guid": 22,
"namespace": ""
}
],
"keyup": [
{
"type": "keyup",
"origType": "keyup",
"handler": {
"guid": 24
},
"guid": 24,
"selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
"needsContext": false,
"namespace": ""
}
],
"mouseover": [
{
"type": "mouseover",
"origType": "mouseover",
"handler": {
"guid": 24
},
"guid": 24,
"selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
"needsContext": false,
"namespace": ""
}
],
"focusout": [
{
"type": "focusout",
"origType": "blur",
"handler": {
"guid": 25
},
"guid": 25,
"selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
"needsContext": false,
"namespace": ""
}
],
"mouseout": [
{
"type": "mouseout",
"origType": "mouseout",
"handler": {
"guid": 25
},
"guid": 25,
"selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
"needsContext": false,
"namespace": ""
}
],
"click": [
{
"type": "click",
"origType": "click",
"handler": {
"guid": 26
},
"guid": 26,
"selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
"needsContext": false,
"namespace": ""
}
]
},
"handle": {},
"focusin": 1,
"focusout": 1
}
},
"length": 1
}
}
}]
我做错了什么?