准备数据
df
Dyad Participant Timestep Tokens
0 1 A 1 apple,banana
1 1 B 1 apple,orange
2 1 A 2 banana
3 1 B 2 orange,kumquat
4 1 A 3 orange
5 1 B 3 orange,pear
6 2 A 1 orange,pear
7 2 B 1 apple,banana,pear
8 2 A 2 banana,persimmon
9 2 B 2 apple
10 2 A 3 banana
11 2 B 3 apple
# Split each comma-separated Tokens string into its parts and store them as a
# frozenset, so that set difference and union can be applied row by row later.
tokens = df['Tokens'].str.split(',').apply(frozenset)
tokens
0 (apple, banana)
1 (orange, apple)
2 (banana)
3 (orange, kumquat)
4 (orange)
5 (orange, pear)
6 (orange, pear)
7 (apple, banana, pear)
8 (persimmon, banana)
9 (apple)
10 (banana)
11 (apple)
Name: Tokens, dtype: object
# For every (Dyad, Participant) group, compare each row's token set with the
# set on the previous row of the same group and compute
#     shared tokens / combined tokens.
# union logic - https://stackoverflow.com/a/46402781/4909087
df = (
    df.assign(Tokens=tokens)
    .groupby(['Dyad', 'Participant'])
    .apply(
        lambda grp: (
            # diff() subtracts the previous row's set from the current one, so
            # len(current) - len(current - previous) is the shared-token count.
            grp['Tokens'].str.len() - grp['Tokens'].diff().str.len()
        )
        / pd.Series(
            [
                # The first row of a group has no predecessor; the '' filler
                # contributes nothing to the union.
                len(current.union(previous))
                for current, previous in zip(
                    grp['Tokens'], grp['Tokens'].shift(1).fillna('')
                )
            ],
            index=grp.index,
        )
    )
    # Everything below is presentation: restore the group keys as columns,
    # bring back the original Timestep/Tokens columns (aligned on the row
    # index), order the rows and label rows that have no overlap value.
    .reset_index(level=[0, 1], name='TokenOverlap')
    .assign(Timestep=df.Timestep, Tokens=df.Tokens)
    .sort_values(['Dyad', 'Timestep', 'Participant'])
    .fillna('(no value)')
    [['Dyad', 'Participant', 'Timestep', 'Tokens', 'TokenOverlap']]
)
df
Dyad Participant Timestep Tokens TokenOverlap
0 1 A 1 apple,banana (no value)
1 1 B 1 apple,orange (no value)
2 1 A 2 banana 0.5
3 1 B 2 orange,kumquat 0.333333
4 1 A 3 orange 0
5 1 B 3 orange,pear 0.333333
6 2 A 1 orange,pear (no value)
7 2 B 1 apple,banana,pear (no value)
8 2 A 2 banana,persimmon 0
9 2 B 2 apple 0.333333
10 2 A 3 banana 0.5
11 2 B 3 apple 1
简而言之,这段代码的作用是先按 `Dyad` 和 `Participant` 分组,然后在每组内求相邻两行之间的成对比值。这需要用到比较复杂的 `groupby` 和 `apply`,因为我们要做几次集合的 `union` 和 `difference` 操作。核心逻辑在 `groupby.apply` 中,剩下的只是修饰。
10 loops, best of 3: 19.2 ms per loop
def _overlap_with_previous(group):
    """Return shared-token count over combined-token count for each row.

    Each row's token set is compared with the set on the previous row of
    ``group``; the first row has no predecessor and yields NaN.
    """
    current = group.Tokens
    # '' stands in for the missing predecessor; it adds nothing to a union.
    previous = current.shift(1).fillna('')
    # diff() subtracts the previous set from the current one, so this is
    # len(current) - len(current - previous), i.e. the shared-token count.
    shared = current.str.len() - current.diff().str.len()
    combined = pd.Series(
        [len(now.union(before)) for now, before in zip(current, previous)],
        index=group.index,
    )
    return shared / combined


# Step 1: compute the overlap ratio within each (Dyad, Participant) group.
df2 = df.assign(Tokens=tokens).groupby(['Dyad', 'Participant']).apply(_overlap_with_previous)
# Step 2: turn the group keys back into columns and name the ratio column.
df2 = df2.reset_index(level=[0, 1], name='TokenOverlap')
# Step 3: restore the original Timestep and Tokens columns (aligned on index).
df2 = df2.assign(Timestep=df.Timestep, Tokens=df.Tokens)
# Step 4: order the rows, then label rows that have no overlap value.
df2 = df2.sort_values(['Dyad', 'Timestep', 'Participant'])
df2 = df2.fillna('(no value)')
# Step 5: fix the column order for display.
df2 = df2[['Dyad', 'Participant', 'Timestep', 'Tokens', 'TokenOverlap']]