代码之家 › 专栏 › 技术社区 › CodeNoob

将字符串拆分为单个字母并记住位置

split string r

CodeNoob · 技术社区 · 7 年前

我有这样一个数据集:

# Example input: three ids, each paired with an 18-character string
test.table <- data.frame(
  id = 1:3,
  sequence = c(
    "HELLOTHISISASTRING",
    "STRING|IS||18|LONG",
    "SOMEOTHERSTRING!!!"
  )
)

每个序列具有相同的长度(18)。现在我想创建一个如下表:

#id  position letter
#1   1        H
#1   2        E
#1   3        L
#.....etc

我知道可以用 strsplit 把每个字符串拆分成单个字符:

# Split every sequence into a character vector of single letters
# (an empty split pattern breaks the string between every character)
splitted <- strsplit(as.character(test.table[["sequence"]]), split = "")

但我不知道该如何把它转换成我想要的格式。

7 回复 | 直到 7 年前

Sotos 7 年前

splitstackshape

library(splitstackshape)

# Split every sequence into single characters, one output row per character
dt1 <- cSplit(
  test.table, 'sequence',
  sep = '', direction = 'long', stripWhite = FALSE
)
# Number the characters within each id instead of recycling a hard-coded
# 1..18, so the positions stay correct for sequences of any length
dt1$pos <- ave(seq_along(dt1$id), dt1$id, FUN = seq_along)

结果如下:

    id sequence pos
 1:  1        H   1
 2:  1        E   2
 3:  1        L   3
 4:  1        L   4
 5:  1        O   5
 6:  1        T   6
 7:  1        H   7
 8:  1        I   8
 9:  1        S   9
10:  1        I  10
...

Jaap 7 年前

# Name each split character vector by its id and stack them into long format;
# [2:1] puts the id column (ind) before the characters (values)
df <- stack(
  setNames(strsplit(as.character(test.table$sequence), ""), test.table$id)
)[2:1]
# Number the characters within each id. ave() writes its result back into a
# copy of its first argument, so that argument must be an integer index:
# passing the character column `values` would yield positions stored as text
df$pos <- with(df, ave(seq_along(values), ind, FUN = seq_along))

它给出:

> df
   ind values pos
1    1      H   1
2    1      E   2
3    1      L   3
4    1      L   4
5    1      O   5
6    1      T   6
7    1      H   7
8    1      I   8
....

或使用 data.table :

library(data.table)
setDT(test.table)  # convert test.table to a data.table by reference

# Step 1: per id, split the sequence into single characters (one row each).
# Step 2: number the rows within each id.
# The trailing [] prints the result of the := update.
test.table[
  , .(letter = unlist(tstrsplit(sequence, "", fixed = TRUE))), by = id
][
  , pos := seq_len(.N), by = id
][]

结果是一样的:

    id letter pos
 1:  1      H   1
 2:  1      E   2
 3:  1      L   3
 4:  1      L   4
 5:  1      O   5
 6:  1      T   6
 7:  1      H   7
 8:  1      I   8
....

Axeman 7 年前

你可以用 tidyverse

test.table <- data.frame(
  id = seq(1,3),
  sequence = c('HELLOTHISISASTRING','STRING|IS||18|LONG','SOMEOTHERSTRING!!!')
)
library(tidyverse)

# Store the split characters as a list-column, expand it to one row per
# character, then number the rows within each (id, sequence) group.
# The column to unnest is named explicitly: calling unnest() without `cols`
# is deprecated in current tidyr, while unnest(letters) works in old and new
# versions alike.
test.table %>%
  mutate(letters = str_split(sequence, "")) %>%
  unnest(letters) %>%
  group_by(id, sequence) %>%
  mutate(position = row_number())
#> # A tibble: 54 x 4
#> # Groups:   id, sequence [3]
#>       id sequence           letters position
#>    <int> <fct>              <chr>      <int>
#>  1     1 HELLOTHISISASTRING H              1
#>  2     1 HELLOTHISISASTRING E              2
#>  3     1 HELLOTHISISASTRING L              3
#>  4     1 HELLOTHISISASTRING L              4
#>  5     1 HELLOTHISISASTRING O              5
#>  6     1 HELLOTHISISASTRING T              6
#>  7     1 HELLOTHISISASTRING H              7
#>  8     1 HELLOTHISISASTRING I              8
#>  9     1 HELLOTHISISASTRING S              9
#> 10     1 HELLOTHISISASTRING I             10
#> # ... with 44 more rows

由 reprex 包(v0.2.0)创建于 2018-09-07。

Saurabh Chauhan 7 年前

可以试试使用 stringi 包:

library(stringi)

# Build one data frame per row of test.table (id, position, letter) and bind
# them together once at the end. Using data.frame() rather than cbind() keeps
# id and position numeric: cbind() on mixed types produces a character
# matrix, which turned every column into text.
pieces <- lapply(seq_len(nrow(test.table)), function(i) {
  chars <- as.character(test.table$sequence[i])
  n_chars <- stri_length(chars)
  data.frame(
    id = rep(test.table$id[i], n_chars),
    position = seq_len(n_chars),
    # stri_sub() is vectorised over `from`: one 1-character slice per position
    letter = stri_sub(chars, from = seq_len(n_chars), length = 1L),
    stringsAsFactors = FALSE
  )
})
data <- do.call(rbind, pieces)

输出:

  id position letter
1  1        1      H
2  1        2      E
3  1        3      L
4  1        4      L
5  1        5      O
6  1        6      T

Vlad C. 7 年前

这里已经有一些很好的答案了,但是这里有另一种方法可以使用 tidyverse .

test.table <- data.frame(
  id = seq(1,3),
  sequence = c('HELLOTHISISASTRING','STRING|IS||18|LONG','SOMEOTHERSTRING!!!')
)

library(tidyverse)
library(reshape2)

# Derive the sequence width from the data instead of hard-coding 18.
# Assumes every sequence has the same length, as in the example data.
n_chars <- max(nchar(as.character(test.table$sequence)))

# separate() cuts the string after positions 1..(n_chars - 1), giving one
# column per character (named "1".."n_chars"); melt() then reshapes those
# columns to long format with `position` and `letter` columns.
test.table %>%
  separate(
    col = sequence,
    into = as.character(seq_len(n_chars)),
    sep = seq_len(n_chars - 1L)
  ) %>%
  melt('id', value.name = 'letter', variable.name = 'position') %>%
  arrange(id, position)

在上述代码中,来自 tidyr 的 separate 函数把 sequence 列拆分为 18 个独立的列(命名为 1 到 18),然后 melt 把这些列转换成长格式,得到 position 和 letter 两列。

zx8754 7 年前

# Translate characters one-for-one across every sequence: H -> Z, E -> X, S -> Y
chartr(old = "HES", new = "ZXY", x = test.table$sequence)
# [1] "ZXLLOTZIYIYAYTRING" "YTRING|IY||18|LONG" "YOMXOTZXRYTRING!!!"

AndS. 7 年前

这是同一思路的另一种变体。

library(tidyverse)

# Nest everything except id, build two parallel list-columns (the single
# characters and their 1-based positions), then unnest both together.
test.table %>%
  nest(-id) %>%
  mutate(
    letters = map(data, function(d) unlist(str_split(d$sequence, ''))),
    numbers = map(letters, seq_along)
  ) %>%
  unnest(letters, numbers)
#> # A tibble: 54 x 3
#>       id letters numbers
#>    <int> <chr>     <int>
#>  1     1 H             1
#>  2     1 E             2
#>  3     1 L             3
#>  4     1 L             4
#>  5     1 O             5
#>  6     1 T             6
#>  7     1 H             7
#>  8     1 I             8
#>  9     1 S             9
#> 10     1 I            10
#> # ... with 44 more rows

或者稍作改动,以避免调用两次 map:

# Nest everything except id, then build a small tibble per id holding the
# single characters and their 1-based positions, and unnest it.
# tibble() replaces the deprecated data_frame(); the positions are derived
# from the split result itself via seq_along().
test.table %>%
  nest(-id) %>%
  mutate(newdata = map(data, ~tibble(
    letters = unlist(str_split(.x$sequence, "")),
    numbers = seq_along(letters)))) %>%
  unnest(newdata)
#> # A tibble: 54 x 3
#>       id letters numbers
#>    <int> <chr>     <int>
#>  1     1 H             1
#>  2     1 E             2
#>  3     1 L             3
#>  4     1 L             4
#>  5     1 O             5
#>  6     1 T             6
#>  7     1 H             7
#>  8     1 I             8
#>  9     1 S             9
#> 10     1 I            10
#> # ... with 44 more rows

由 reprex 包(v0.2.0)创建于 2018-09-07。