选项1
我会用
groupby
和
transform
具有
first
.
使改变
将在所有实例中广播第一个遇到的值
组的。
df.assign(
Account_Number=
df.groupby('Account_Number')
.Dummy_Account
.transform('first')
)
Account_Number Dummy_Account
0 ACC0000000000001 ACC0000000000001
1 ACC0000000000002 ACC0000000000002
2 ACC0000000000001 ACC0000000169532
3 ACC0000000123005 ACC0000000123005
4 ACC0000000000004 ACC0000000000004
5 ACC0000000000005 ACC0000000000005
6 ACC0000000123005 ACC0000000000040
选项2
使用Numpy的
np.unique
获取第一个值的索引和一个逆。
索引(
idx
)确定
'Account_Number'
发生。我用这个来切
'Dummy_Account'
. 然后使用逆数组(
inv
)旨在将唯一值放回原位,但我将其用于重合数组中处于相同位置的对象。
u, idx, inv = np.unique(
df.Account_Number.values,
return_index=True,
return_inverse=True
)
df.assign(
Account_Number=
df.Dummy_Account.values[idx][inv]
)
Account_Number Dummy_Account
0 ACC0000000000001 ACC0000000000001
1 ACC0000000000002 ACC0000000000002
2 ACC0000000000001 ACC0000000169532
3 ACC0000000123005 ACC0000000123005
4 ACC0000000000004 ACC0000000000004
5 ACC0000000000005 ACC0000000000005
6 ACC0000000123005 ACC0000000000040
选项3
或使用
pd.factorize
和
pd.Series.duplicated
.
与选项2类似,但我让
duplicated
扮演识别第一个值所在位置的角色。然后,我用生成的布尔数组对重合值进行切片,然后用
pd。因式分解
.
f
扮演的角色与
投资部
来自选项2。
d = ~df.Account_Number.duplicated().values
f, u = pd.factorize(df.Account_Number.values)
df.assign(
Account_Number=
df.Dummy_Account.values[d][f]
)
Account_Number Dummy_Account
0 ACC0000000000001 ACC0000000000001
1 ACC0000000000002 ACC0000000000002
2 ACC0000000000001 ACC0000000169532
3 ACC0000000123005 ACC0000000123005
4 ACC0000000000004 ACC0000000000004
5 ACC0000000000005 ACC0000000000005
6 ACC0000000123005 ACC0000000000040
时间测试
后果
res.plot(loglog=True)
res.div(res.min(1), 0)
jez cold pir1 pir2 pir3
10 4.524811 3.819322 2.870916 1.000000 1.238144
30 4.833144 4.093932 3.310285 1.000000 1.382189
100 4.863337 4.048008 3.146154 1.000000 1.320060
300 5.144460 3.894850 3.157636 1.000000 1.357779
1000 4.870499 3.742524 2.348021 1.000000 1.069559
3000 5.375105 3.432398 1.852771 1.126024 1.000000
10000 7.100372 4.335100 1.890134 1.551161 1.000000
30000 7.227139 3.993985 1.530002 1.594531 1.000000
100000 8.052324 3.811728 1.380440 1.708170 1.000000
300000 8.690613 4.204664 1.539624 1.942090 1.000000
1000000 7.787494 3.668117 1.498758 2.129085 1.000000
安装程序
def jez(d):
v = d.sort_values('Account_Number')
v['Account_Number'] = v['Dummy_Account'].mask(v.duplicated('Account_Number')).ffill()
return v.sort_index()
def cold(d):
m = d.drop_duplicates('Account_Number', keep='first')\
.set_index('Account_Number')\
.Dummy_Account
return d.assign(Account_Number=d.Account_Number.map(m))
def pir1(d):
return d.assign(
Account_Number=
d.groupby('Account_Number')
.Dummy_Account
.transform('first')
)
def pir2(d):
u, idx, inv = np.unique(
d.Account_Number.values,
return_index=True,
return_inverse=True
)
return d.assign(
Account_Number=
d.Dummy_Account.values[idx][inv]
)
def pir3(d):
p = ~d.Account_Number.duplicated().values
f, u = pd.factorize(d.Account_Number.values)
return d.assign(
Account_Number=
d.Dummy_Account.values[p][f]
)
res = pd.DataFrame(
index=[10, 30, 100, 300, 1000, 3000, 10000,
30000, 100000, 300000, 1000000],
columns='jez cold pir1 pir2 pir3'.split(),
dtype=float
)
np.random.seed([3, 1415])
for i in res.index:
d = pd.DataFrame(dict(
Account_Number=np.random.randint(i // 2, size=i),
Dummy_Account=range(i)
))
d = pd.concat([df] * i, ignore_index=True)
for j in res.columns:
stmt = f'{j}(d)'
setp = f'from __main__ import {j}, d'
res.at[i, j] = timeit(stmt, setp, number=100)