代码之家  ›  专栏  ›  技术社区  ›  Michael Yadidya Karthik Sekaran

使用Pandas或Numpy将字符串转换为唯一的数值

  •  0
  • Michael Yadidya Karthik Sekaran  · 技术社区  · 7 年前

    我是Python初学者,正在学习深度学习。这个问题对你们大多数人来说可能很简单,但我该怎么做呢?

    如何将以下对象转换为唯一的数值?

    df['city'].unique()
    
    array(['LIMA', 'VACAVILLE', 'CINCINNATI', 'GLASGOW', 'BOWLING GREEN',
       'LANCASTER', 'HOUSTON', 'SPRINGFIELD', 'RAPID CITY', 'FORT WORTH',
       'LAREDO', 'NEW YORK', 'CHARLESTON', 'PITTSBURGH',
       'WEST VALLEY CITY', 'CAYCE', 'HOT SPRINGS NATIO', 'CANTON',
       'FORT WAYNE', 'DU BOIS', 'DAYTON', 'MASON CITY', 'WASHINGTON',
       'LAKE OSWEGO', 'FAYETTEVILLE', 'SALT LAKE CITY', 'KNOXVILLE',
       'TURLOCK', 'MCALLEN', 'CENTERVILLE', 'ROCHESTER', 'OKLAHOMA CITY',
       'GAUTIER', 'DOYLESTOWN', 'ATLANTA', 'MEADVILLE', 'FORT MYERS',
       'ERIE', 'BEAUMONT', 'JACKSON', 'CLARKSVILLE', 'BETHLEHEM',
       'SAN ANTONIO', 'LAS VEGAS', 'ATHENS', 'SAN LUIS OBISPO', 'SEATTLE',
       'BRADENTON', 'TINLEY PARK', 'HUNTLEY', 'SYRACUSE', 'WHEELWRIGHT',
       'TOWSON', 'YONKERS', 'ARDEN HILLS', 'MARION', 'LIVONIA',
       'COLORADO SPRINGS', 'CURWENSVILLE', 'SAINT CHARLES', 'PETERSBURG',
       'SCOTTSDALE', 'SILVER SPRING', 'PORTLAND', 'BIRMINGHAM',
       'CEDARVILLE', 'CLERMONT', 'ASHEVILLE', 'SHREVEPORT', 'DRAPER',
       'WAVERLY', 'CANANDAIGUA', 'MOUNT PLEASANT', 'MARIETTA', 'MANKATO',
       'HARLINGEN', 'HATCH', 'MOBILE', 'POULSBO', 'GARDEN GROVE',
       'GIG HARBOR', 'OCONOMOWOC', 'MOUNT MORRIS', 'ORLANDO',
       'DODGE CITY', 'DILLSBURG', 'HUNTSVILLE', 'KANSAS CITY',
       'JACKSONVILLE', 'DULUTH', 'CITRUS HEIGHTS', 'ONEONTA', 'LOS LUNAS',
       'GIBSONIA', 'ROBINSON', 'VERNON HILLS', 'PHOENIX', 'DESTIN',
       'SHEPHERD', 'BROOKLYN', 'PLANO', 'WINTERS', 'JAMAICA', 'POWAY',
       'LEXINGTON', 'UPLAND', 'NEW ALBANY', 'GREENVILLE',
       'JEFFERSON CITY', 'ARLINGTON', 'BUFFALO', 'LOS ANGELES',
       'CHARLOTTE', 'WEST LAFAYETTE', 'GARY', 'COOPERSTOWN', 'GREAT BEND',
       'DAVISON', 'SMYRNA', 'MISSOURI CITY', 'MEMPHIS',
       'FORT WALTON BEACH', 'KISSIMMEE', 'BATAVIA', 'OLDSMAR', 'WYNNE',
       'ASHVILLE', 'FT BRAGG', 'TROY', 'SHAKER HTS', 'CLEVELAND HTS',
       'HAMBURG'], dtype=object)
    

    我正在尝试使用这些数据训练一个模型。

    2 回复  |  直到 7 年前
        1
  •  1
  •   jezrael    7 年前

    如果我理解正确(IIUC),你需要使用 factorize :

    df = pd.DataFrame({'city':list('abcddf')})
    df['city1'] = pd.factorize(df['city'])[0]
    

    或者转换为 categorical 类型,然后获取其 codes :

    df['city'] = pd.Categorical(df['city'])
    df['city1'] = df['city'].cat.codes
    

    print (df)
    
      city  city1
    0    a      0
    1    b      1
    2    c      2
    3    d      3
    4    d      3
    5    f      4
    
        2
  •  0
  •   pe-perry    7 年前

    您也可以尝试 sklearn.preprocessing.LabelEncoder 。如文档中所述,它使用 0 到 n_classes-1 之间的值对标签进行编码。

    from sklearn.preprocessing import LabelEncoder
    
    le = LabelEncoder()
    df['city_num'] = le.fit_transform(df['city'])
    
    print(df.head())
    #             city  city_num
    # 0           LIMA        72
    # 1      VACAVILLE       122
    # 2     CINCINNATI        21
    # 3        GLASGOW        50
    # 4  BOWLING GREEN        10
    print(len(df.city.unique()))
    # 132
    print(len(set(df.city_num)))
    # 132
    

    然后可以将数字列转换为指示变量(one-hot)列:

    from sklearn.preprocessing import OneHotEncoder
    
    ohe = OneHotEncoder()
    city_ind = ohe.fit_transform(df.city_num.values.reshape(-1, 1))
    
    print(type(city_ind))
    # <class 'scipy.sparse.csr.csr_matrix'>
    
    print(city_ind.shape)
    # (132, 132)
    
    print(city_ind[0:2, ].toarray())
    # [[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    #    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    #    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    #    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    #    1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    #    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    #    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    #    0.  0.  0.  0.  0.  0.]
    #  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    #    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    #    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    #    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    #    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    #    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    #    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.
    #    0.  0.  0.  0.  0.  0.]]