代码之家  ›  专栏  ›  技术社区  ›  Md Riaz

多标签文本分类。我有一个文本/标签csv。文本是纯文本,标签是字母数字

  •  0
  • Md Riaz  · 技术社区  · 7 年前
        import keras
        import keras.backend as K
        from keras.optimizers import Adam
        from keras.models import Sequential
        from keras.layers import Dense
        from keras.layers.core import Activation
        from keras.preprocessing.text import Tokenizer          # for 
        tokenizing text
        from keras.preprocessing.sequence import pad_sequences  # for 
        padding sentences with zeros. To make the sentence length same
        from keras.utils import to_categorical                  # for one- 
        hot encoding of the labels
        from keras.layers import Dense, Input, Flatten, Dropout, 
        BatchNormalization
        from keras.layers import Conv1D, MaxPooling1D, Embedding
        from keras.models import Sequential
        from sklearn.model_selection import train_test_split
    
    
        MAX_SEQUENCE_LENGTH = 300   
        MAX_NB_WORDS = 20000        
    
        #Reading the data
        raw_data=pd.read_csv("/home/riaz.k/Desktop/TRAIN.csv")
        raw_data.head()
    
        # create training and testing vars
        train, test = train_test_split(raw_data, test_size=0.3)
        train.head()
        test.head()
    
        tokenizer = Tokenizer(num_words=MAX_NB_WORDS)   
        tokenizer.fit_on_texts(train.Procedure)           
        train_sequences = tokenizer.texts_to_sequences(train.Procedure)
        test_sequences = tokenizer.texts_to_sequences(test.Procedure)
    
        word_index = tokenizer.word_index                
        containing words and their index
        # print(tokenizer.word_index)                  
        print('Found %s unique tokens.' % len(word_index)) 
        train_data = pad_sequences(train_sequences, 
        maxlen=MAX_SEQUENCE_LENGTH)  
        train
        test_data=pad_sequences(test_sequences,maxlen=MAX_SEQUENCE_LENGTH) 
        test
    
        print(train_data.shape)
        print(test_data.shape)
        print (word_index)
    
        train_labels = train['dxcode']
        test_labels = test['dxcode']
        from sklearn import preprocessing
        from sklearn.preprocessing import LabelEncoder
        le = LabelEncoder()                  # converts the character 
                                            array to numeric array. 
                                    Assigns levels to unique labels.
        le.fit(train_labels)
        le.fit(test_labels)
        train_labels = le.transform(train_labels)
        test_labels = le.transform(test_labels)
    
        print(le.classes_)
        print(np.unique(train_labels, return_counts=True))
        print(np.unique(test_labels, return_counts=True))
    
        le.inverse_transform(1)
    
        labels_train = to_categorical(np.asanyarray(train_labels))
        labels_test  = to_categorical(np.asarray(test_labels))
        print('Shape of data tensor:', train_data.shape)
        print('Shape of label tensor:', labels_train.shape)
        print('Shape of label tensor:', labels_test.shape)
    
        EMBEDDING_DIM = 100
        print(MAX_SEQUENCE_LENGTH)
    
        print('Training model.')
    
        model = Sequential()
        model.add(Embedding(MAX_NB_WORDS,
                            EMBEDDING_DIM,
                            input_length=MAX_SEQUENCE_LENGTH
                            ))
        model.add(Dropout(0.2))
        model.add(Conv1D(128, 5, activation='relu'))
        model.add(MaxPooling1D(5))
        model.add(Dropout(0.5))
        model.add(BatchNormalization())
        model.add(Conv1D(128, 5, activation='relu'))
        model.add(MaxPooling1D(5))
        model.add(Dropout(0.5))
        model.add(BatchNormalization())
        model.add(Flatten())
        model.add(Dense(128, activation='relu'))
        model.add(Dense(23, activation='softmax'))
    
    
        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['acc'],)
    
    
        model.fit(train_data, labels_train,
                  batch_size=32,
                  epochs=10,
                  validation_data=(test_data, labels_test))
    
        model.evaluate(test_data, labels_test)
        pred = model.predict(test_data)
        pred
        # print(model.layers)
        for layer in model.layers:
            print(layer)
    
        import keras.backend as K
        emd = K.function(inputs=[model.layers[0].input], 
                         outputs=[model.layers[0].output])
    
        rbind = np.concatenate((train_data, test_data), axis=0)
        print(rbind.shape)
        ### Submissions file 
        test_results = model.predict_classes(rbind)
        #print(test_results)
        test_labels = le.inverse_transform(test_results)
        #test_labels = [le.inverse_transform(i) for i in test_results] 
        submissions_CNN = 
        pd.DataFrame({'id':raw_data['Claimno'],"label":test_labels})
        submissions_CNN.to_csv("/home/riaz.k/Desktop/submissions.csv",index=False)
    

    文本文档可以使用多个标签进行标记,因此如何在此数据集上进行多标签分类?我已经阅读了sklearn的很多文档,但我似乎找不到正确的方法来进行多标签分类。提前感谢您的帮助。

    1 回复  |  直到 7 年前
        1
  •  0
  •   Vivek Kumar    7 年前

    您是否在这一行中看到错误:

    train_labels = le.transform(train_labels)
    

    如果是,那是因为在它上面的一行中,您正在这样做:

    le.fit(test_labels)
    

    这会忘记以前的数据(即之前调用 fit() 时学到的类别),只记住 test_labels 中的标签。因此,当出现一个新标签(训练集中有、但测试集中没有)时,transform() 就会抛出此错误。

    您需要把下面这两行替换掉:

    le.fit(train_labels)
    le.fit(test_labels)
    

    改为如下写法:

    # I am using .tolist() because I observe that your 
    # train_labels, test_labels are pandas Series objects
    le.fit(train_labels.tolist() + test_labels.tolist())