代码之家  ›  专栏  ›  技术社区  ›  Saurabh Jain

sklearn管道不工作

  •  0
  • Saurabh Jain  · 技术社区  · 8 年前

    movie review 数据包含两列：第一列是 class，第二列是 text。

    # Load the movie-review CSV; the complete data set is used as train data.
    input_file_df = pd.read_csv("movie-pang.csv")
    x_train, y_train = input_file_df["text"], input_file_df["class"]
    

    我只使用了一个特征：每个句子的情感得分（sentiment score）。为此我编写了一个自定义 transformer：

    class GetWorldLevelSentiment(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
    
    def get_word_level_sentiment(self, word_list):
        sentiment_score = 1
        for word in word_list:
            word_sentiment = swn.senti_synsets(word)
    
            if len(word_sentiment) > 0:
                word_sentiment = word_sentiment[0]
            else:
                continue
    
            if word_sentiment.pos_score() > word_sentiment.neg_score():
                word_sentiment_score = word_sentiment.pos_score()
            elif word_sentiment.pos_score() < word_sentiment.neg_score():
                word_sentiment_score = word_sentiment.neg_score()*(-1)
            else:
                word_sentiment_score = word_sentiment.pos_score()
    
            print word, " " , word_sentiment_score
            if word_sentiment_score != 0:
                sentiment_score = sentiment_score * word_sentiment_score
    
        return sentiment_score
    
    def transform(self, review_list, y=None):
        sentiment_score_list = list()
        for review in review_list:
            sentiment_score_list.append(self.get_word_level_sentiment(review.split()))
        
        return np.asarray(sentiment_score_list)
    
    def fit(self, x, y=None):
        return self
    

    # Two-step pipeline: the custom sentiment transformer feeds a
    # multinomial naive Bayes classifier.
    pipeline = Pipeline(steps=[
        ("word_level_sentiment", GetWorldLevelSentiment()),
        ("clf", MultinomialNB()),
    ])

    # Train on the complete data set loaded above.
    pipeline.fit(x_train, y_train)
    

    但这给了我以下错误:

    This MultinomialNB instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

    2 回复  |  直到 5 年前
        1
  •  0
  •   Saurabh Jain    8 年前

    这对我很有效:

    class GetWorldLevelSentiment(BaseEstimator, TransformerMixin):
        """Transformer that turns each review into a single sentiment feature.

        The feature is the product of the non-zero signed word scores looked
        up through ``swn.senti_synsets``. ``transform`` returns the feature as
        a one-column DataFrame with one row per review.
        """

        def __init__(self):
            # No hyper-parameters to store.
            pass

        def get_word_level_sentiment(self, word_list):
            """Return the product of the non-zero signed scores of the words.

            Args:
                word_list: iterable of word strings.

            Returns:
                The product of the signed scores, starting from 1. Words with
                no sentiment entry, and words whose score is 0, do not change
                the product, so an input without any scored word yields 1.
            """
            sentiment_score = 1
            for word in word_list:
                # Materialise the lookup result: depending on the NLTK/Python
                # version it may be a lazy iterator, which supports neither
                # len() nor indexing.
                synsets = list(swn.senti_synsets(word))
                if not synsets:
                    continue
                # Only the first entry for the word is considered.
                first = synsets[0]

                pos = first.pos_score()
                neg = first.neg_score()
                # Negative-dominant words contribute a negative score; ties
                # fall back to the positive score.
                word_sentiment_score = -neg if pos < neg else pos

                # Debug trace. Written as one formatted string so the same
                # line is emitted by the Python 2 print statement and the
                # Python 3 print function.
                print("%s   %s" % (word, word_sentiment_score))

                # A zero score would wipe out the whole product, so skip it.
                if word_sentiment_score != 0:
                    sentiment_score = sentiment_score * word_sentiment_score

            return sentiment_score

        def transform(self, review_list, y=None):
            """Score every review.

            Args:
                review_list: iterable of review strings; each one is split on
                    whitespace before scoring.
                y: ignored.

            Returns:
                A pandas DataFrame with a single column holding one score per
                review, i.e. a two-dimensional feature table.
            """
            sentiment_score_list = [
                self.get_word_level_sentiment(review.split())
                for review in review_list
            ]
            # Fixed from `sentiment_score-list`, which raised a NameError.
            return pd.DataFrame(sentiment_score_list)

        def fit(self, x, y=None):
            """Nothing is learned; return the transformer itself."""
            return self
    
        2
  •  0
  •   Платформа Игр    3 年前
        from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import cross_val_score
    
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import Pipeline, FeatureUnion
    from sklearn.preprocessing import StandardScaler
    from sklearn import metrics
    import pandas as pd
    
    # Text transformer
    class TextTransformer(BaseEstimator, TransformerMixin):
        """Select a text feature from the input by key.

        ``transform`` returns ``X[key]``; presumably ``X`` is a pandas
        DataFrame and ``key`` a column name (confirm against the caller).
        """

        def __init__(self, key):
            # Key used to index the input in ``transform``.
            self.key = key

        def fit(self, X, y=None, *parg, **kwarg):
            """Nothing is learned; any extra arguments are ignored."""
            return self

        def transform(self, X):
            """Return the entry of ``X`` stored under ``self.key``."""
            selected = X[self.key]
            return selected
    
    # Numeric transformer
    class NumberTransformer(BaseEstimator, TransformerMixin):
        """Select a numeric feature from the input by key.

        ``transform`` indexes ``X`` with the one-element list ``[key]``;
        presumably ``X`` is a pandas DataFrame, in which case the result is a
        one-column DataFrame (confirm against the caller).
        """

        def __init__(self, key):
            # Key used to index the input in ``transform``.
            self.key = key

        def fit(self, X, y=None):
            """Nothing is learned; return the transformer itself."""
            return self

        def transform(self, X):
            """Return ``X`` indexed with the list ``[self.key]``."""
            wanted = [self.key]
            return X[wanted]
    
    
    
    def fit_predict(model, X_train, X_test, y_train):
        """Fit a feature-union pipeline around ``model`` and predict.

        The pipeline combines one text branch (key ``'clear_text'``, passed
        through a ``CountVectorizer``) with four numeric branches (keys
        ``'word_count'``, ``'posting_day'``, ``'posting_month'`` and
        ``'theme'``), then feeds the union to ``model``.

        Args:
            model: estimator used as the final ``'clf'`` step.
            X_train: training input, indexed by the keys listed above.
            X_test: input to predict on, indexed by the same keys.
            y_train: training targets.

        Returns:
            A ``(preds, pipe_fit)`` tuple: the predictions for ``X_test`` and
            the fitted pipeline.
        """
        # NOTE(review): the original comment and variable name referred to a
        # TF-IDF vectorizer, but the code builds a CountVectorizer -- confirm
        # which one is intended.
        ngram_counter = CountVectorizer(ngram_range=(2, 3), max_df=0.93, min_df=0.05)

        # Text feature: select the column, then vectorize it.
        text_branch = Pipeline([
            ('transformer', TextTransformer(key='clear_text')),
            ('vectorizer', ngram_counter),
        ])

        # Numeric features: (name inside the union, key to select).
        numeric_columns = (
            ('Num1_Feature', 'word_count'),
            ('Num3_Feature', 'posting_day'),
            ('Num4_Feature', 'posting_month'),
            ('Num6_Feature', 'theme'),
        )

        # Combine all features, text branch first.
        branches = [('Text_Feature', text_branch)]
        branches.extend(
            (label, Pipeline([('transformer', NumberTransformer(key=column))]))
            for label, column in numeric_columns
        )

        # Chain the combined features and the classifier, then train.
        fitted = Pipeline([
            ('features', FeatureUnion(branches)),
            ('clf', model),
        ]).fit(X_train, y_train)

        # Predict on the held-out data.
        return fitted.predict(X_test), fitted