代码之家  ›  专栏  ›  技术社区  ›  NoUsername9

使用Apify和Puppeteer抓取URL

  •  1
  • NoUsername9  · 技术社区  · 5 年前

    我正试图通过一个名为 "web-scraper" 的 Apify actor ( https://apify.com/apify/web-scraper ) 从 https://en.wikipedia.org/wiki/List_of_hedge_funds 抓取网址。

    具体来说,我试图使用以下Apify pageFunction 抓取该目标页面,并从HTML中存在的锚点标签返回URL列表。

    pageFunction:
    /**
     * Page function for the Apify "web-scraper" actor: reads the page title
     * and selects the anchors that sit directly inside table cells.
     *
     * Written against https://en.wikipedia.org/wiki/List_of_hedge_funds; the
     * URL actually reported is whatever `context.request.url` holds.
     *
     * Note: `anchorTag` is the jQuery collection returned by the selector call
     * itself. No `href` values are read from the matched elements here.
     *
     * @param {object} context - Reads `context.jQuery` and `context.request.url`.
     * @returns {Promise<object>} Resolves to `{ url, pageTitle, anchorTag }`.
     */
    async function pageFunction( context ) {
        const { jQuery: $ } = context;

        // Title of the document: text of the first <title> element.
        const titleNode = $('title').first();
        const pageTitle = titleNode.text();

        // Anchors that are direct children of a <td> inside a <tr>.
        const anchorTag = $('tr > td > a');

        const { url } = context.request;
        return { url, pageTitle, anchorTag };
    }
    

    在我的控制台中,我希望在名为 anchorTag 的属性中看到目标页面上一个或多个锚点标签的 href 属性值。我还希望在名为 pageTitle 的属性中看到页面标题,以及 url 属性。具体如下:

    我希望看到的是:
    {
      "url": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
      "pageTitle": "List of hedge funds - Wikipedia",
      "anchorTag": {
        "0": "http://example0.com", // each instance of "http://example.com" represents a unique url on the target page to be scraped
        "1": "http://example1.com",
        "2": "http://example2.com",
        "3": "http://example3.com",
        ...
        "39": "http://example39.com",
    }}
    

    但是,actor返回的不是URL列表,而是以下数据集:

    我实际看到的:
    [{
      "url": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
      "pageTitle": "List of hedge funds - Wikipedia",
      "anchorTag": {
        "0": {},
        "1": {},
        "2": {},
        "3": {},
        "4": {},
        "5": {},
        "6": {},
        "7": {},
        "8": {},
        "9": {},
        "10": {},
        "11": {},
        "12": {},
        "13": {},
        "14": {},
        "15": {},
        "16": {},
        "17": {},
        "18": {},
        "19": {},
        "20": {},
        "21": {},
        "22": {},
        "23": {},
        "24": {},
        "25": {},
        "26": {},
        "27": {},
        "28": {},
        "29": {},
        "30": {},
        "31": {},
        "32": {},
        "33": {},
        "34": {},
        "35": {},
        "36": {},
        "37": {},
        "38": {},
        "39": {},
        "length": 40,
        "prevObject": {
          "0": {
            "location": {
              "href": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
              "ancestorOrigins": {},
              "origin": "https://en.wikipedia.org",
              "protocol": "https:",
              "host": "en.wikipedia.org",
              "hostname": "en.wikipedia.org",
              "port": "",
              "pathname": "/wiki/List_of_hedge_funds",
              "search": "",
              "hash": "",
              "assign": {},
              "reload": {},
              "toString": {},
              "replace": {}
            },
            "write": {},
            "writeln": {},
            "jQuery3410461525655351679551": {
              "events": {
                "mmv-setup-overlay": [
                  {
                    "type": "mmv-setup-overlay",
                    "origType": "mmv-setup-overlay",
                    "handler": {
                      "guid": 21
                    },
                    "guid": 21,
                    "namespace": ""
                  }
                ],
                "mmv-cleanup-overlay": [
                  {
                    "type": "mmv-cleanup-overlay",
                    "origType": "mmv-cleanup-overlay",
                    "handler": {
                      "guid": 22
                    },
                    "guid": 22,
                    "namespace": ""
                  }
                ],
                "keyup": [
                  {
                    "type": "keyup",
                    "origType": "keyup",
                    "handler": {
                      "guid": 24
                    },
                    "guid": 24,
                    "selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
                    "needsContext": false,
                    "namespace": ""
                  }
                ],
                "mouseover": [
                  {
                    "type": "mouseover",
                    "origType": "mouseover",
                    "handler": {
                      "guid": 24
                    },
                    "guid": 24,
                    "selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
                    "needsContext": false,
                    "namespace": ""
                  }
                ],
                "focusout": [
                  {
                    "type": "focusout",
                    "origType": "blur",
                    "handler": {
                      "guid": 25
                    },
                    "guid": 25,
                    "selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
                    "needsContext": false,
                    "namespace": ""
                  }
                ],
                "mouseout": [
                  {
                    "type": "mouseout",
                    "origType": "mouseout",
                    "handler": {
                      "guid": 25
                    },
                    "guid": 25,
                    "selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
                    "needsContext": false,
                    "namespace": ""
                  }
                ],
                "click": [
                  {
                    "type": "click",
                    "origType": "click",
                    "handler": {
                      "guid": 26
                    },
                    "guid": 26,
                    "selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
                    "needsContext": false,
                    "namespace": ""
                  }
                ]
              },
              "handle": {},
              "focusin": 1,
              "focusout": 1
            }
          },
          "length": 1
        }
      }
    }]
    

    我做错了什么?

    1 回复  |  直到 5 年前
        1
  •  2
  •   Lukáš Křivka    5 年前

    您必须访问 a 标签的 href 属性才能获取URL。此外,您需要遍历所有 a 标签,并将这些URL放入一个数组中。

    // ...
    // Fragment of the pageFunction body: `$`, `cssSelector`, `pageTitle` and
    // `context` are presumably defined by the elided code above.
    const anchorTag = $( cssSelector );
    const links = [];
    
    // anchorTag is a jQuery collection, not a plain JavaScript value, so it is
    // walked with jQuery's own .each() method.
    anchorTag.each((index, el) => {
        // Read the href attribute of the current matched element.
        const link = $(el).attr('href');
        // Keep only truthy values, i.e. skip anchors with a missing or empty href.
        if (link) {
             links.push(link);
        }
    })
    
    // Return the collected href strings rather than the jQuery collection.
    return {
       url: context.request.url,
       pageTitle,
       links,
    };