代码之家  ›  专栏  ›  技术社区  ›  KJ Saxena

如何用PHP制作一个简单的爬虫程序?

  •  63
  • KJ Saxena  · 技术社区  · 16 年前

    我有一个有很多链接的网页。我想写一个脚本,将这些链接中包含的所有数据转储到本地文件中。

    15 回复  |  直到 12 年前
        1
  •  89
  •   David Warthen hobodave    8 年前

    唉（Meh）。不要 parse HTML with regexes （不要用正则表达式解析 HTML）。

    <?php
    /**
     * Recursively crawl $url and every page it links to, up to $depth link
     * levels, echoing each page's URL and HTML to stdout.
     *
     * @param string $url   Absolute URL to start from.
     * @param int    $depth Maximum number of link levels to follow.
     */
    function crawl_page($url, $depth = 5)
    {
        // Shared across recursive calls so no URL is ever crawled twice.
        static $seen = array();
        if (isset($seen[$url]) || $depth === 0) {
            return;
        }

        $seen[$url] = true;

        $dom = new DOMDocument('1.0');
        // '@' silences the warnings that real-world malformed HTML triggers.
        @$dom->loadHTMLFile($url);

        $anchors = $dom->getElementsByTagName('a');
        foreach ($anchors as $element) {
            $href = $element->getAttribute('href');
            if (0 !== strpos($href, 'http')) {
                // Relative link: resolve it against the current page URL.
                $path = '/' . ltrim($href, '/');
                if (extension_loaded('http')) {
                    $href = http_build_url($url, array('path' => $path));
                } else {
                    // Rebuild scheme://[user:pass@]host[:port] by hand.
                    $parts = parse_url($url);
                    $href = $parts['scheme'] . '://';
                    if (isset($parts['user']) && isset($parts['pass'])) {
                        $href .= $parts['user'] . ':' . $parts['pass'] . '@';
                    }
                    $href .= $parts['host'];
                    if (isset($parts['port'])) {
                        $href .= ':' . $parts['port'];
                    }
                    // BUG FIX: parse_url() omits 'path' entirely for URLs like
                    // "http://example.com", so dirname() used to be called on
                    // an undefined index. Default the base directory to the
                    // site root; rtrim avoids a "//" when the path is "/".
                    $basePath = isset($parts['path']) ? rtrim(dirname($parts['path'], 1), '/') : '';
                    $href .= $basePath . $path;
                }
            }
            crawl_page($href, $depth - 1);
        }
        echo "URL:",$url,PHP_EOL,"CONTENT:",PHP_EOL,$dom->saveHTML(),PHP_EOL,PHP_EOL;
    }
    crawl_page("http://hobodave.com", 2);
    

    我修复了Tatu版本中的一些bug(现在可以使用相对URL)。

    编辑: 我添加了一个新的功能,防止它跟随同一个URL两次。

    现在将输出回显到标准输出,以便您可以将其重定向到所需的任何文件

    编辑: 修正了乔治在回答中指出的错误。相对url将不再附加到url路径的末尾,而是覆盖它。多亏了乔治。请注意,George的回答没有考虑以下任何因素:https、user、pass或port。如果你有 http PECL扩展加载这是非常简单的使用 http_build_url

        2
  •  16
  •   WonderLand    11 年前

    下面是我基于上述示例/答案的实现。

    1. 它是基于类的
    2. 支持HTTP身份验证
    3. 跳过不属于基本域的Url
    4. 返回每个页面的Http头响应代码
    5. 每页的返回时间

    爬网类:

    /**
     * Simple recursive crawler built around cURL + DOMDocument.
     *
     *  - Optional HTTP basic auth (setHttpAuth()).
     *  - Skips URLs that do not contain the start host.
     *  - Skips URLs matching any registered filter path (addFilterPath()).
     *  - Prints HTTP status code and fetch time for every crawled page.
     */
    class crawler
    {
        protected $_url;               // start URL
        protected $_depth;             // maximum number of link levels to follow
        protected $_host;              // host of the start URL (on-domain check)
        protected $_useHttpAuth = false;
        protected $_user;
        protected $_pass;
        protected $_seen = array();    // crawled URLs: url => true
        protected $_filter = array();  // URL substrings that must be skipped
    
        /**
         * @param string $url   Absolute start URL.
         * @param int    $depth Maximum number of link levels to follow.
         */
        public function __construct($url, $depth = 5)
        {
            $this->_url = $url;
            $this->_depth = $depth;
            $parse = parse_url($url);
            // BUG FIX: parse_url() returns false on badly malformed URLs and
            // may omit 'host'; guard against an undefined-index notice.
            $this->_host = (is_array($parse) && isset($parse['host'])) ? $parse['host'] : '';
        }
    
        /**
         * Extract every <a href> from $content, resolve relative links
         * against $url, and recurse into each of them.
         */
        protected function _processAnchors($content, $url, $depth)
        {
            $dom = new DOMDocument('1.0');
            // '@' silences the warnings malformed real-world HTML triggers.
            @$dom->loadHTML($content);
            $anchors = $dom->getElementsByTagName('a');
    
            foreach ($anchors as $element) {
                $href = $element->getAttribute('href');
                if (0 !== strpos($href, 'http')) {
                    // Relative link: rebuild an absolute URL from $url's parts.
                    $path = '/' . ltrim($href, '/');
                    if (extension_loaded('http')) {
                        $href = http_build_url($url, array('path' => $path));
                    } else {
                        $parts = parse_url($url);
                        $href = $parts['scheme'] . '://';
                        if (isset($parts['user']) && isset($parts['pass'])) {
                            $href .= $parts['user'] . ':' . $parts['pass'] . '@';
                        }
                        $href .= $parts['host'];
                        if (isset($parts['port'])) {
                            $href .= ':' . $parts['port'];
                        }
                        $href .= $path;
                    }
                }
                // Crawl only link that belongs to the start domain
                // (crawl_page() -> isValid() performs that check).
                $this->crawl_page($href, $depth - 1);
            }
        }
    
        /**
         * Fetch $url with cURL.
         *
         * @return array [string|false $body, int $httpCode, float $seconds]
         */
        protected function _getContent($url)
        {
            $handle = curl_init($url);
            if ($this->_useHttpAuth) {
                curl_setopt($handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
                curl_setopt($handle, CURLOPT_USERPWD, $this->_user . ":" . $this->_pass);
            }
            // Following 302 redirects breaks HTTP authentication, so it
            // stays disabled here.
    //        curl_setopt($handle, CURLOPT_FOLLOWLOCATION, TRUE);
            // return the content
            curl_setopt($handle, CURLOPT_RETURNTRANSFER, TRUE);
    
            /* Get the HTML or whatever is linked in $url. */
            $response = curl_exec($handle);
            // response total time
            $time = curl_getinfo($handle, CURLINFO_TOTAL_TIME);
            /* Check for 404 (file not found). */
            $httpCode = curl_getinfo($handle, CURLINFO_HTTP_CODE);
    
            curl_close($handle);
            return array($response, $httpCode, $time);
        }
    
        /**
         * Print one result line and flush it to the client immediately.
         */
        protected function _printResult($url, $depth, $httpcode, $time)
        {
            // BUG FIX: ob_end_flush() raises a notice when no output buffer
            // is active -- always the case on the first page printed -- so
            // only flush when a buffer actually exists.
            if (ob_get_level() > 0) {
                ob_end_flush();
            }
            $currentDepth = $this->_depth - $depth;
            $count = count($this->_seen);
            echo "N::$count,CODE::$httpcode,TIME::$time,DEPTH::$currentDepth URL::$url <br>";
            ob_start();
            flush();
        }
    
        /**
         * A URL is crawlable when it contains the start host, depth remains,
         * it has not been seen yet, and it matches no filter path.
         */
        protected function isValid($url, $depth)
        {
            if (strpos($url, $this->_host) === false
                || $depth === 0
                || isset($this->_seen[$url])
            ) {
                return false;
            }
            foreach ($this->_filter as $excludePath) {
                if (strpos($url, $excludePath) !== false) {
                    return false;
                }
            }
            return true;
        }
    
        /**
         * Crawl one page (if valid) and recurse into its links.
         */
        public function crawl_page($url, $depth)
        {
            if (!$this->isValid($url, $depth)) {
                return;
            }
            // add to the seen URL
            $this->_seen[$url] = true;
            // get Content and Return Code
            list($content, $httpcode, $time) = $this->_getContent($url);
            // print Result for current Page
            $this->_printResult($url, $depth, $httpcode, $time);
            // process subPages
            $this->_processAnchors($content, $url, $depth);
        }
    
        /**
         * Enable HTTP basic auth for every request.
         */
        public function setHttpAuth($user, $pass)
        {
            $this->_useHttpAuth = true;
            $this->_user = $user;
            $this->_pass = $pass;
        }
    
        /**
         * Skip every URL containing $path (e.g. 'customer/account/login').
         */
        public function addFilterPath($path)
        {
            $this->_filter[] = $path;
        }
    
        /**
         * Start crawling from the URL given to the constructor.
         */
        public function run()
        {
            $this->crawl_page($this->_url, $this->_depth);
        }
    }
    

    // USAGE
    // Example: crawl a basic-auth protected site up to 6 link levels deep,
    // skipping login-referer URLs.
    $startURL = 'http://YOUR_URL/';
    $depth = 6;
    $username = 'YOURUSER';
    $password = 'YOURPASS';
    $crawler = new crawler($startURL, $depth);
    $crawler->setHttpAuth($username, $password);
    // Exclude path with the following structure to be processed 
    $crawler->addFilterPath('customer/account/login/referer');
    $crawler->run();
    
        3
  •  11
  •   GeekTantra    16 年前
        4
  •  9
  •   Tatu Ulmanen    16 年前

    /**
     * Fetch $url, recursively crawl every link found in it (up to $depth
     * levels), then append the page URL and its HTML to results.txt.
     *
     * NOTE: parses HTML with a regex, which is fragile; use DOMDocument for
     * anything serious.
     *
     * @param string $url   Page to fetch.
     * @param int    $depth Remaining link levels to follow.
     */
    function crawl_page($url, $depth = 5) {
        if($depth > 0) {
            $html = file_get_contents($url);
    
            preg_match_all('~<a.*?href="(.*?)".*?>~', $html, $matches);
    
            foreach($matches[1] as $newurl) {
                crawl_page($newurl, $depth - 1);
            }
    
            // BUG FIX: the original wrote $newurl here -- the *last* link
            // found on the page, or an undefined variable when the page has
            // no links at all. The page being saved is $url.
            file_put_contents('results.txt', $url."\n\n".$html."\n\n", FILE_APPEND);
        }
    }
    
    crawl_page('http://www.domain.com/index.php', 5);
    

    该函数将从页面获取内容,然后爬网所有找到的链接并将内容保存到“results.txt”。这些函数接受第二个参数depth,它定义了链接应该遵循的时间长度。如果只想解析给定页面中的链接,请在此处传递1。

        5
  •  5
  •   Community Mohan Dere    8 年前

    既然可以使用 wget ，为什么还要用 PHP 呢？

    wget -r -l 1 http://www.example.com
    

    有关如何解析内容，请参阅 Best Methods to parse HTML ，并使用搜索功能查找更多 examples 。如何解析 HTML 之前已经被回答过很多次了。

        6
  •  5
  •   Community Mohan Dere    8 年前

    只需对 hobodave's 代码,这里是一个可以用来抓取页面的代码片段。这需要在服务器中启用curl扩展。

    <?php
    //set_time_limit (0);
    /**
     * Crawl $url with cURL and recursively follow every <a href=""> link
     * found, up to $depth levels. Requires the curl extension.
     *
     * @param string $url   Absolute URL to fetch.
     * @param int    $depth Remaining link levels to follow.
     */
    function crawl_page($url, $depth = 5){
        // BUG FIX: $seen must be static *and* populated; the original
        // re-created an empty array on every call and never added to it,
        // so the revisit guard could never trigger.
        static $seen = array();
        if(($depth == 0) or (in_array($url, $seen))){
            return;
        }
        $seen[] = $url;
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER,1);
        $result = curl_exec ($ch);
        curl_close ($ch);
        if( $result ){
            // Drop everything except <a> tags before regex-matching them.
            $stripped_file = strip_tags($result, "<a>");
            preg_match_all("/<a[\s]+[^>]*?href[\s]?=[\s\"\']+"."(.*?)[\"\']+.*?>"."([^<]+|.*?)?<\/a>/", $stripped_file, $matches, PREG_SET_ORDER ); 
            foreach($matches as $match){
                $href = $match[1];
                if (0 !== strpos($href, 'http')) {
                    // Relative link: resolve it against the *page* URL.
                    // BUG FIX: the original passed the relative $href itself
                    // to http_build_url()/parse_url(), which has no scheme or
                    // host and produced broken absolute URLs.
                    $path = '/' . ltrim($href, '/');
                    if (extension_loaded('http')) {
                        $href = http_build_url($url, array('path' => $path));
                    } else {
                        $parts = parse_url($url);
                        $href = $parts['scheme'] . '://';
                        if (isset($parts['user']) && isset($parts['pass'])) {
                            $href .= $parts['user'] . ':' . $parts['pass'] . '@';
                        }
                        $href .= $parts['host'];
                        if (isset($parts['port'])) {
                            $href .= ':' . $parts['port'];
                        }
                        $href .= $path;
                    }
                }
                crawl_page($href, $depth - 1);
            }
        }
        // BUG FIX: $href is undefined when the page had no links; the page
        // that was just crawled is $url.
        echo "Crawled {$url}";
    }
    crawl_page("http://www.sitename.com/",3);
    ?>
    

    我已经在本文中解释了本教程 crawler script tutorial

        7
  •  3
  •   George    15 年前

    <?php
    /**
     * hobodave's crawler with George's relative-URL change: relative links
     * are resolved from the site root instead of being appended to the
     * current page URL.
     *
     * Known limitation (noted in hobodave's answer): user/pass and port of
     * the original URL are dropped when rebuilding relative links.
     *
     * @param string $url   Absolute URL to start from.
     * @param int    $depth Maximum number of link levels to follow.
     */
    function crawl_page($url, $depth = 5)
    {
      // Shared across recursive calls so a URL is never visited twice.
      static $seen = array();
      if (isset($seen[$url]) || $depth === 0) {
        return;
      }
    
      $seen[$url] = true;
    
      $dom = new DOMDocument('1.0');
      // '@' silences warnings from malformed real-world HTML.
      @$dom->loadHTMLFile($url);
    
      $anchors = $dom->getElementsByTagName('a');
      foreach ($anchors as $element) {
        $href = $element->getAttribute('href');
        if (0 !== strpos($href, 'http')) {
           /* this is where I changed hobodave's code */
            // BUG FIX: keep the page's own scheme instead of hard-coding
            // "http://", which silently downgraded links on https sites.
            $host = parse_url($url, PHP_URL_SCHEME)."://".parse_url($url, PHP_URL_HOST);
            $href = $host. '/' . ltrim($href, '/');
        }
        crawl_page($href, $depth - 1);
      }
    
      echo "New Page:<br /> ";
      echo "URL:",$url,PHP_EOL,"<br />","CONTENT:",PHP_EOL,$dom->saveHTML(),PHP_EOL,PHP_EOL,"  <br /><br />";
    }
    
    crawl_page("http://hobodave.com/", 5);
    ?>
    
        8
  •  2
  •   Jens Roland    16 年前

    正如前面提到的,有很多爬虫框架可以定制,但是如果你所做的事情像你提到的那样简单,你可以很容易地从头开始。

    删除链接: http://www.phpro.org/examples/Get-Links-With-DOM.html

    将结果转储到文件: http://www.tizag.com/phpT/filewrite.php

        9
  •  1
  •   Anders Atiqur    10 年前

    <?php
    function crawl_page($url, $depth = 5)
    {
      $parts = parse_url($url);
      if(array_key_exists('fragment', $parts)){
        unset($parts['fragment']);
        $url = http_build_url($parts);
      }
    
      static $seen = array();
      ...
    

    然后您还可以省略 $parts = parse_url($url);

        10
  •  1
  •   Niraj patel    10 年前

    你可以试试这个,也许对你有帮助

    // Find the <div class="primary-content"> block whose nested
    // listingcontainer/list title relates to $search_string, copy that block
    // into $tmpTitalDom and echo it.
    $search_string = 'american golf News: Fowler beats stellar field in Abu Dhabi';
    // NOTE(review): "url of the site" is a placeholder, not valid PHP --
    // replace it with a quoted URL string before running this snippet.
    $html = file_get_contents(url of the site);
    $dom = new DOMDocument;
    $titalDom = new DOMDocument;
    $tmpTitalDom = new DOMDocument;
    // Suppress libxml warnings while parsing real-world (often invalid) HTML.
    libxml_use_internal_errors(true);
    @$dom->loadHTML($html);
    libxml_use_internal_errors(false);
    $xpath = new DOMXPath($dom);
    $videos = $xpath->query('//div[@class="primary-content"]');
    foreach ($videos as $key => $video) {
    // Re-parse each matched <div> into its own document so the next XPath
    // query is scoped to that block only.
    $newdomaindom = new DOMDocument;    
    $newnode = $newdomaindom->importNode($video, true);
    $newdomaindom->appendChild($newnode);
    @$titalDom->loadHTML($newdomaindom->saveHTML());
    $xpath1 = new DOMXPath($titalDom);
    $titles = $xpath1->query('//div[@class="listingcontainer"]/div[@class="list"]');
    // NOTE(review): strcmp() returns 0 on an exact match, so this condition
    // is true when the title does NOT equal $search_string -- the test looks
    // inverted; confirm intended behaviour before reuse.
    if(strcmp(preg_replace('!\s+!',' ',  $titles->item(0)->nodeValue),$search_string)){     
        $tmpNode = $tmpTitalDom->importNode($video, true);
        $tmpTitalDom->appendChild($tmpNode);
        break;
    }
    }
    echo $tmpTitalDom->saveHTML();
    
        11
  •  1
  •   Dov Jacobson    8 年前

    谢谢你@hobodave。

    然而,我发现你的代码有两个弱点。 为了得到“主机”段,对原始url的解析在第一个斜杠处停止。这假定所有相关链接都从根目录开始。这只是有时候是真的。

    original url   :  http://example.com/game/index.html
    href in <a> tag:  highscore.html
    author's intent:  http://example.com/game/highscore.html  <-200->
    crawler result :  http://example.com/highscore.html       <-404->
    

    通过在最后一个斜杠而不是第一个斜杠处打断来修复此问题

    $depth 并不是真正跟踪递归深度，而是跟踪第一层递归的广度。

    如果我相信这个页面正在被积极使用,我可能会调试第二个版本,但我怀疑我现在写的文本永远不会被任何人阅读,无论是人类还是机器人,因为这个版本已经有六年了,我甚至没有足够的声誉通过对他的代码进行注释直接通知+hobodave这些缺陷。无论如何,谢谢你。

        12
  •  0
  •   Ian    8 年前

    我想出了下面的蜘蛛代码。 PHP - Is the there a safe way to perform deep recursion? 似乎相当快。。。。

        <?php
    /**
     * Breadth-first spider starting at $base_url.
     *
     * The work queue deliberately mixes two kinds of entries:
     *  - strings: URLs to fetch, and
     *  - arrays:  raw preg_match_all() results (candidate link matches) that
     *    get flattened and resolved in the "else" branch below.
     *
     * @param string $base_url    Start URL; also used to resolve relative links.
     * @param array  $search_urls Substrings: only matching URLs are collected.
     *                            Empty array collects every visited URL.
     * @return array Collected URLs ($found_urls).
     */
    function  spider( $base_url , $search_urls=array() ) {
        $queue[] = $base_url;
        $done           =   array();
        $found_urls     =   array();
        while($queue) {
                $link = array_shift($queue);
                if(!is_array($link)) {
                    // String entry: a concrete URL -- mark done, maybe collect,
                    // then fetch it and queue its raw link matches.
                    $done[] = $link;
                    foreach( $search_urls as $s) { if (strstr( $link , $s )) { $found_urls[] = $link; } }
                    if( empty($search_urls)) { $found_urls[] = $link; }
                    if(!empty($link )) {
    echo 'LINK:::'.$link;
                          $content =    file_get_contents( $link );
    //echo 'P:::'.$content;
                        preg_match_all('~<a.*?href="(.*?)".*?>~', $content, $sublink);
                        // NOTE(review): $sublink is the whole preg_match_all
                        // result array, while $done/$queue hold URL strings --
                        // these in_array checks can never match; actual
                        // de-duplication happens in the "else" branch below.
                        if (!in_array($sublink , $done) && !in_array($sublink , $queue)  ) {
                               $queue[] = $sublink;
                        }
                    }
                } else {
                        // Array entry: a nested preg_match_all result.
                        $result=array();
                        $return = array();
                        // flatten multi dimensional array of URLs to one dimensional.
                        while(count($link)) {
                             $value = array_shift($link);
                             if(is_array($value))
                                 foreach($value as $sub)
                                    $link[] = $sub;
                             else
                                   $return[] = $value;
                         }
                         // now loop over one dimensional array.
                         foreach($return as $link) {
                                    // echo 'L::'.$link;
                                    // url may be in form <a href.. so extract what's in the href bit.
                                    preg_match_all('/<a[^>]+href=([\'"])(?<href>.+?)\1[^>]*>/i', $link, $result);
                                    if ( isset( $result['href'][0] )) { $link = $result['href'][0]; }
                                    // add the new URL to the queue.
                                    // Relative links (no "http") are prefixed
                                    // with $base_url; absolute links are kept
                                    // only when they contain $base_url.
                                    if( (!strstr( $link , "http")) && (!in_array($base_url.$link , $done)) && (!in_array($base_url.$link , $queue)) ) {
                                         $queue[]=$base_url.$link;
                                    } else {
                                        if ( (strstr( $link , $base_url  ))  && (!in_array($base_url.$link , $done)) && (!in_array($base_url.$link , $queue)) ) {
                                             $queue[] = $link;
                                        }
                                    }
                          }
                }
        }


        return $found_urls;
    }    


        // USAGE: collect every URL under /acatalog/ on the start site.
        $base_url       =   'https://www.houseofcheese.co.uk/';
        $search_urls    =   array(  $base_url.'acatalog/' );
        $done = spider( $base_url  , $search_urls  );

        //
        // RESULT
        //
        //
        echo '<br /><br />';
        echo 'RESULT:::';
        foreach(  $done as $r )  {
            echo 'URL:::'.$r.'<br />';
        }
    
        13
  •  0
  •   Antony    8 年前

    值得记住的是,在抓取外部链接时(我很欣赏OP与用户自己的页面相关),您应该知道robots.txt。我发现了以下几点,希望能有所帮助 http://www.the-art-of-web.com/php/parse-robots/ .

        14
  •  0
  •   TURTLE    8 年前

    我创建了一个小类,从提供的url获取数据,然后提取您选择的html元素。该类使用CURL和DOMDocument。

    class crawler {


       public static $timeout = 2;
       public static $agent   = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)';


       /**
        * Download the body of $url with cURL, sending the configured user
        * agent and timeouts. Returns the raw response body, or false on
        * failure.
        */
       public static function http_request($url) {
          $session = curl_init($url);
          curl_setopt($session, CURLOPT_USERAGENT,      self::$agent);
          curl_setopt($session, CURLOPT_CONNECTTIMEOUT, self::$timeout);
          curl_setopt($session, CURLOPT_TIMEOUT,        self::$timeout);
          curl_setopt($session, CURLOPT_RETURNTRANSFER, true);
          $body = curl_exec($session);
          curl_close($session);
          return $body;
       }


       /**
        * Collapse every run of whitespace to a single space and trim both
        * ends of the string.
        */
       public static function strip_whitespace($data) {
          return trim(preg_replace('/\s+/', ' ', $data));
       }


       /**
        * Parse $data as HTML and return every <$tag> element as an entry of
        * the form ['text' => ..., 'attributes' => [name => value, ...]],
        * with attribute names lowercased and all values whitespace-stripped.
        */
       public static function extract_elements($tag, $data) {
          $document  = new DOMDocument;
          @$document->loadHTML($data);
          $collected = array();
          $position  = 0;
          foreach ( $document->getElementsByTagName($tag) as $node ) {
             $entry = array('text' => self::strip_whitespace($node->nodeValue));
             foreach ( $node->attributes as $attr ) {
                $entry['attributes'][strtolower($attr->nodeName)] = self::strip_whitespace($attr->nodeValue);
             }
             $collected[$position++] = $entry;
          }
          return $collected;
       }


    }
    

    用法示例:

    // Fetch a page, extract every <a> element (text + attributes) and dump
    // the result to links.json as pretty-printed JSON.
    $data  = crawler::http_request('https://stackoverflow.com/questions/2313107/how-do-i-make-a-simple-crawler-in-php');
    $links = crawler::extract_elements('a', $data);
    if ( count($links) > 0 ) {
       file_put_contents('links.json', json_encode($links, JSON_PRETTY_PRINT));
    }
    

    答复示例:

    [
        {
            "text": "Stack Overflow",
            "attributes": {
                "href": "https:\/\/stackoverflow.com",
                "class": "-logo js-gps-track",
                "data-gps-track": "top_nav.click({is_current:false, location:2, destination:8})"
            }
        },
        {
            "text": "Questions",
            "attributes": {
                "id": "nav-questions",
                "href": "\/questions",
                "class": "-link js-gps-track",
                "data-gps-track": "top_nav.click({is_current:true, location:2, destination:1})"
            }
        },
        {
            "text": "Developer Jobs",
            "attributes": {
                "id": "nav-jobs",
                "href": "\/jobs?med=site-ui&ref=jobs-tab",
                "class": "-link js-gps-track",
                "data-gps-track": "top_nav.click({is_current:false, location:2, destination:6})"
            }
        }
    ]
    
        15
  •  0
  •   zstate    7 年前

    这是个老问题。从那以后发生了很多好事。这是我在这个话题上的两分钱:

    1. 要准确跟踪访问的页面,您必须首先规范化URI。标准化算法包括多个步骤:

      • 将查询参数按字典序排序。 例子： GET http://www.example.com/query?id=111&cat=222 → GET http://www.example.com/query?cat=222&id=111
      • 转换空路径。 例子: http://example.org → http://example.org/

      • 将百分号编码的八位字节统一为大写。 例子: http://example.org/a%c2%B1b → http://example.org/a%C2%B1b

      • 删除不必要的点段。 例子: http://example.org/../a/b/../c/./d.html → http://example.org/a/c/d.html

    2. 不仅 <a> href 属性 <area> 标签上也有 https://html.com/tags/area/ <面积> 标签也是。

    3. 善待网站所有者。

    如果你不想处理这个问题,试试看 Crawlzone https://www.codementor.io/zstate/this-is-how-i-crawl-n98s6myxm

    推荐文章