代码之家  ›  专栏  ›  技术社区  ›  guyabel

从html表中提取文本和链接

  •  6
  • guyabel  · 技术社区  · 7 年前

    enter image description here

    我正在尝试在 R 中从 HTML 表格里同时提取文本和链接。

    # Recreate the example HTML document from R so the problem is reproducible.
    # The table has a `site` column whose cells hold zero, one or several links,
    # and a `country` column with plain text.
    x <- "
    <html>
    <head>
    </head>
    <body>
    <table>
    <tbody>
    <tr>
    <th>site</th>
    <th>country</th>
    </tr>
    <tr>
    <td> <a href='http://www.nbc.com'>NBC</a>   <a href='https://www.cnn.com'>CNN</a>  <a href='https://www.nytimes.com'>NY Times</a> </td>
    <td> US</td>
    </tr>
    <tr>
    <td> <a href='http://www.dw-world.de/'>DW</a>  </td>
    <td> DE</td>
    </tr>
    <tr>
    <td></td>
    <td>FR</td>
    </tr>
    <tr>
    <td> <a href='http://www.bbc.co.uk'>BBC</a>   <a href='https://www.itv.co.uk'>ITV</a></td>
    <td> UK</td>
    </tr>
    </tbody>
    </table>
    </body>
    </html>"
    # writeLines() writes the text as-is; unlike write.table() it needs no
    # quote/col.names/row.names arguments to suppress tabular formatting.
    writeLines(x, con = "table.html")
    # Display the generated file to check its contents.
    file.show("table.html")
    

    最终我想要像这样整洁的数据框。。。

    # # A tibble: 7 x 3
    #   site                    site_name country
    #   <chr>                   <chr>     <chr>  
    # 1 http://www.nbc.com      NBC       US     
    # 2 https://www.cnn.com     CNN       US     
    # 3 https://www.nytimes.com NY Times  US     
    # 4 http://www.dw-world.de/ DW        DE     
    # 5 NA                      NA        FR     
    # 6 http://www.bbc.co.uk    BBC       UK     
    # 7 https://www.itv.co.uk   ITV       UK 
    

    我一直在玩 rvest

    library(tidyverse)
    library(rvest)
    
    # Parse the saved page once; `h` is reused by the later snippets.
    h <- read_html("table.html")

    # html_table() keeps only the cell text, so the link targets are lost
    html_table(h)[[1]]
    #                  site country
    # 1 NBC   CNN  NY Times      US
    # 2                  DW      DE
    # 3                          FR
    # 4           BBC   ITV      UK

    # link targets: the href attribute of every <a> element
    html_attr(html_nodes(h, "a"), "href")
    # [1] "http://www.nbc.com"      "https://www.cnn.com"     "https://www.nytimes.com" "http://www.dw-world.de/" "http://www.bbc.co.uk"    "https://www.itv.co.uk"

    # link labels: the text of every <a> element
    html_text(html_nodes(h, "a"))
    # [1] "NBC"      "CNN"      "NY Times" "DW"       "BBC"      "ITV"
    
    # looks promising, perhaps can combine with results from html_table()
    library(XML)
    # Parse the file with the XML package and collect every <table> node.
    tables <- getNodeSet(htmlParse("table.html"), path = "//table")
    # Cell reader handed to readHTMLTable(): instead of the cell text, return
    # the attributes of the <a> elements that are direct children of the node.
    hrefFun <- function(x) xpathSApply(x, "./a", xmlAttrs)
    # Read the first table, extracting each cell with hrefFun.
    readHTMLTable(tables[[1]], elFun = hrefFun)
    # V1     V2
    # 1                                                                                         list() list()
    # 2 c(href = "http://www.nbc.com", href = "https://www.cnn.com", href = "https://www.nytimes.com") list()
    # 3                                                                        http://www.dw-world.de/ list()
    # 4                                                                                         list() list()
    # 5                               c(href = "http://www.bbc.co.uk", href = "https://www.itv.co.uk") list()
    
    # looks promising for the rows.. don't know where to go from here
    # one node per <tr> element of the parsed page
    html_nodes(h, "tr")
    # {xml_nodeset (5)}
    # [1] <tr>\n<th>site</th>\r\n<th>country</th>\r\n</tr>\n
    # [2] <tr>\n<td> <a href="http://www.nbc.com">NBC</a>   <a href="https://www.cnn.com">CNN</a>  <a href="https://www.nytimes.com">NY Times</a>  ...
    # [3] <tr>\n<td> <a href="http://www.dw-world.de/">DW</a>  </td>\r\n<td> DE</td>\r\n</tr>\n
    # [4] <tr>\n<td></td>\r\n<td>FR</td>\r\n</tr>\n
    # [5] <tr>\n<td> <a href="http://www.bbc.co.uk">BBC</a>   <a href="https://www.itv.co.uk">ITV</a>\n</td>\r\n<td> UK</td>\r\n</tr>
    
    2 回复  |  直到 7 年前
        1
  •  2
  •   Yifu Yan    7 年前

    您可以分三个步骤来完成:

    1. 创建一个包含所有映射(站点名称 → 链接)的表。
    2. 按所需格式创建表(每个站点一行)。
    3. 把这两张表连接起来。

    第一步:

    # Lookup table from link label to link target, built from every <a>
    # element of the HTML held in `x`.
    nodes_with_info <- html_nodes(read_html(x), "a")
    map_table <- tibble(
        site = str_trim(html_text(nodes_with_info)),
        href = html_attr(nodes_with_info, "href")
    )
    

    结果:

    site     href                   
    <chr>    <chr>                  
    1 NBC      http://www.nbc.com     
    2 CNN      https://www.cnn.com    
    3 NY Times https://www.nytimes.com
    4 DW       http://www.dw-world.de/
    5 BBC      http://www.bbc.co.uk   
    6 ITV      https://www.itv.co.uk  
    

    第二步:

    # Plain-text version of the table: one row per country, with all link
    # labels of that row packed into a single `site` string.
    my_table <- read_html(x) %>%
        html_table() %>%
        .[[1]]

    # Reshape to one row per (country, link label).
    structured_table <- my_table %>%
        # Split on any run of two or more whitespace characters. Splitting on
        # exactly two spaces produces spurious empty entries when labels are
        # separated by four or more spaces. Single spaces inside a label
        # (e.g. "NY Times") are left alone.
        mutate(site = str_split(site, "\\s{2,}")) %>%
        # Name the list-column explicitly: calling unnest() without columns
        # is deprecated in tidyr >= 1.0.
        unnest(site) %>%
        mutate(site = str_trim(site))
    

    country     site
    1      US      NBC
    2      US      CNN
    3      US NY Times
    4      DE       DW
    5      FR         
    6      UK      BBC
    7      UK      ITV
    

    # Attach the link target to each label; labels with no link get NA.
    left_join(structured_table, map_table, by = "site")
    

    结果:

    country     site                    href
    1      US      NBC      http://www.nbc.com
    2      US      CNN     https://www.cnn.com
    3      US NY Times https://www.nytimes.com
    4      DE       DW http://www.dw-world.de/
    5      FR                             <NA>
    6      UK      BBC    http://www.bbc.co.uk
    7      UK      ITV   https://www.itv.co.uk
    
        2
  •  0
  •   DJack    7 年前

    当然还有改进的余地,但它是有效的:

    # Extract one row per link from an HTML table that has `site` and `country`
    # columns, followed by one all-NA row for every country whose `site` cell
    # is empty.
    #
    # Args:
    #   url: anything accepted by read_html() (file path, URL, ...).
    #
    # Returns:
    #   A data.frame with columns `site_name` (the link's href), `site` (the
    #   link's text) and `country`. The column names are kept exactly as in the
    #   original answer so existing callers keep working.
    extract_data <- function(url) {
      # Parse the document once and reuse it for the table and the links.
      doc <- read_html(url)

      table <- doc %>%
        html_table() %>%
        data.frame()

      links <- html_nodes(doc, "a")
      df <- data.frame(site_name = html_attr(links, "href"),
                       site = html_text(links),
                       stringsAsFactors = FALSE)

      # For each link label, find the table row whose `site` text contains it.
      # fixed = TRUE matches the label literally rather than as a regular
      # expression. vapply() guarantees exactly one integer per label (first
      # hit, or NA when nothing matches), so the index below is always a plain
      # vector even when a label matches zero or several rows.
      row_of_link <- vapply(df$site, function(label) {
        hits <- grep(label, table$site, fixed = TRUE)
        if (length(hits) > 0) hits[[1L]] else NA_integer_
      }, integer(1), USE.NAMES = FALSE)
      df$country <- table$country[row_of_link]

      # Countries without any link: their `site` cell is empty (or NA).
      is_empty <- is.na(table$site) | table$site == ""
      emptyRow <- data.frame(site_name = rep(NA_character_, sum(is_empty)),
                             site = rep(NA_character_, sum(is_empty)),
                             country = table$country[is_empty],
                             stringsAsFactors = FALSE)

      rbind(df, emptyRow)
    }
    
    extract_data("table.html")
    
    #                site_name     site country
    #1      http://www.nbc.com      NBC      US
    #2     https://www.cnn.com      CNN      US
    #3 https://www.nytimes.com NY Times      US
    #4 http://www.dw-world.de/       DW      DE
    #5    http://www.bbc.co.uk      BBC      UK
    #6   https://www.itv.co.uk      ITV      UK
    #7                    <NA>     <NA>      FR