代码之家 › 专栏 › 技术社区 › Edamame

Pandas:转换和复制Pandas中的某些行

pandas python-3.x

Edamame · 技术社区 · 6 年前

我有一个pandas数据帧,格式如下:

record_id, f_1 , f_2, f_3, ... , f_n,        A,        B,        C
        1,  0.1, 0.2, 0.3, ... , 1.2,        1,        0,        1
        2,  0.3, 1.2, 0.5, ... , 2.1,        1,        0,        0
        3,  0.2, 3.2, 1.3, ... , 0.4,        1,        1,        0
        4,  1.1, 0.1, 0.7, ... , 0.5,        0,        0,        1
        5,  2.1, 0.5, 0.8, ... , 1.9,        0,        1,        1
        6,  0.5, 0.4, 0.2, ... , 0.8,        1,        1,        1
                     :
                     :

我想把它转换成下面的格式:对于每条记录,A、B、C 中每个取值为 1 的列各复制出一行,并用 target 列标明对应的列名;每行的 weight 为 1/(total_records_after_duplication),即 1 除以该记录复制后的总行数。

record_id, f_1 , f_2, f_3, ... , f_n,    target   weight
        1,  0.1, 0.2, 0.3, ... , 1.2,         A     0.5
        1,  0.1, 0.2, 0.3, ... , 1.2,         C     0.5
        2,  0.3, 1.2, 0.5, ... , 2.1,         A     1.0
        3,  0.2, 3.2, 1.3, ... , 0.4,         A     0.5
        3,  0.2, 3.2, 1.3, ... , 0.4,         B     0.5
        4,  1.1, 0.1, 0.7, ... , 0.5,         C     1.0 
        5,  2.1, 0.5, 0.8, ... , 1.9,         B     0.5
        5,  2.1, 0.5, 0.8, ... , 1.9,         C     0.5
        6,  0.5, 0.4, 0.2, ... , 0.8,         A     0.333
        6,  0.5, 0.4, 0.2, ... , 0.8,         B     0.333
        6,  0.5, 0.4, 0.2, ... , 0.8,         C     0.333   
                         :
                         :

1 回复 | 直到 6 年前

PabTorre 6 年前

如果我理解正确(IIUC),可以使用 wide_to_long :

# Identifier columns: record_id plus every column whose name contains "f_".
# These stay fixed while the indicator columns are unpivoted.
i = df.columns[df.columns.str.contains('f_|record_id')].tolist()

# Reshape wide -> long: every column starting with the stub "output" becomes
# one row per record, and the remainder of the column name goes into "Out".
# The suffix pattern is a raw string so that "\w" reaches the regex engine
# instead of being parsed as an (invalid) string escape sequence.
newdf = pd.wide_to_long(
    df,
    'output',
    i=i,
    j='Out',
    suffix=r'\w+',
)

# Keep only the rows whose indicator is 1; that column of ones is then reused
# as the numerator of the weight.
newdf = (
    newdf
    .loc[lambda x: x['output'] == 1]
    .reset_index()
    .rename(columns={'output': 'weight'})
)

# Put the stub back in front of the suffix to rebuild the full column name.
newdf['Out'] = 'output' + newdf['Out']

# Split each record's weight evenly across the rows it was duplicated into.
newdf['weight'] = (
    newdf['weight']
    / newdf.groupby('record_id')['record_id'].transform('count')
)

newdf
Out[265]: 
    record_id  f_1  f_2  f_3  f_n       Out    weight
0           1  0.1  0.2  0.3  1.2  output_A  0.500000
1           1  0.1  0.2  0.3  1.2  output_C  0.500000
2           2  0.3  1.2  0.5  2.1  output_A  1.000000
3           3  0.2  3.2  1.3  0.4  output_A  0.500000
4           3  0.2  3.2  1.3  0.4  output_B  0.500000
5           4  1.1  0.1  0.7  0.5  output_C  1.000000
6           5  2.1  0.5  0.8  1.9  output_B  0.500000
7           5  2.1  0.5  0.8  1.9  output_C  0.500000
8           6  0.5  0.4  0.2  0.8  output_A  0.333333
9           6  0.5  0.4  0.2  0.8  output_B  0.333333
10          6  0.5  0.4  0.2  0.8  output_C  0.333333

鉴于你已经编辑了问题,可以改用下面的写法:

# Identifier columns: record_id plus every column whose name contains "f_".
id_cols = df.columns[df.columns.str.contains('f_|record_id')].tolist()

# Move the identifiers into the index and stack the remaining indicator
# columns into rows. Naming the column axis "target" and the stacked values
# "weight" up front avoids relying on the auto-generated "level_<k>" label,
# whose number depends on how many identifier columns there are.
newdf = (
    df
    .set_index(id_cols)
    .rename_axis(columns='target')
    .stack()
    .rename('weight')
    .reset_index()
)

# Keep only the rows whose indicator is 1.
newdf = newdf.loc[lambda x: x['weight'] == 1]

所有其他步骤应与上述相同

推荐文章