from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import pandas as pd
# Text transformer: picks one text column out of the input
class TextTransformer(BaseEstimator, TransformerMixin):
    """Select a single text feature from the input by key.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    simply returns ``X[self.key]``, e.g. so that a following vectorizer
    step in a pipeline receives only that one column.

    Args:
        key: Label used to index the input in ``transform``.
    """

    def __init__(self, key):
        # Stored under the same name as the constructor argument.
        self.key = key

    def fit(self, X, y=None, *parg, **kwarg):
        """Return ``self`` unchanged; all arguments are ignored."""
        return self

    def transform(self, X):
        """Return ``X[self.key]``.

        Args:
            X: Any object supporting ``X[key]`` lookup (presumably a pandas
                DataFrame -- confirm against callers).
        """
        selected = X[self.key]
        return selected
# Numeric transformer: picks one numeric column out of the input
class NumberTransformer(BaseEstimator, TransformerMixin):
    """Select a single numeric feature from the input by key.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    returns ``X[[self.key]]``. Note the list-style lookup, which differs
    from a plain ``X[self.key]``.

    Args:
        key: Label used to index the input in ``transform``.
    """

    def __init__(self, key):
        # Stored under the same name as the constructor argument.
        self.key = key

    def fit(self, X, y=None):
        """Return ``self`` unchanged; ``X`` and ``y`` are ignored."""
        return self

    def transform(self, X):
        """Return ``X[[self.key]]``.

        Args:
            X: Any object supporting lookup with a one-element list
                (presumably a pandas DataFrame, where this would give a
                one-column frame rather than a Series -- confirm against
                callers).
        """
        wanted = [self.key]
        return X[wanted]
def fit_predict(model, X_train, X_test, y_train):
    """Fit a feature-union + classifier pipeline and predict on test data.

    The pipeline combines n-gram counts of the ``'clear_text'`` feature
    with four features passed through as-is (``'word_count'``,
    ``'posting_day'``, ``'posting_month'``, ``'theme'``) and feeds the
    result to ``model``.

    Args:
        model: Estimator used as the final ``'clf'`` pipeline step. It is
            placed in the pipeline directly (not cloned here).
        X_train: Training input; must support lookup of the feature keys
            listed above (presumably a pandas DataFrame -- confirm).
        X_test: Test input with the same keys as ``X_train``.
        y_train: Training targets passed to ``Pipeline.fit``.

    Returns:
        A ``(preds, pipe_fit)`` tuple: the predictions for ``X_test`` and
        the fitted pipeline.
    """
    # Counts of 2- and 3-grams, with max_df=0.93 and min_df=0.05.
    # NOTE(review): the original comment and variable name referred to
    # inverse document frequency (TF-IDF), but the code has always used
    # CountVectorizer -- confirm which one is intended.
    ngram_counter = CountVectorizer(
        ngram_range=(2, 3), max_df=0.93, min_df=0.05
    )

    # Text branch: pick the text column, then vectorize it.
    text_branch = Pipeline([
        ('transformer', TextTransformer(key='clear_text')),
        ('vectorizer', ngram_counter),
    ])

    # Numeric branches: (union step name, input key). Each branch is a
    # one-step pipeline that only selects its column.
    numeric_spec = (
        ('Num1_Feature', 'word_count'),
        ('Num3_Feature', 'posting_day'),
        ('Num4_Feature', 'posting_month'),
        ('Num6_Feature', 'theme'),
    )
    numeric_branches = [
        (step_name,
         Pipeline([('transformer', NumberTransformer(key=column))]))
        for step_name, column in numeric_spec
    ]

    # Combine all features; the text branch comes first.
    combined = FeatureUnion(
        [('Text_Feature', text_branch)] + numeric_branches
    )

    # Features followed by the classifier.
    full_pipeline = Pipeline([
        ('features', combined),
        ('clf', model),
    ])

    # Train, then predict on the held-out data.
    fitted = full_pipeline.fit(X_train, y_train)
    predictions = fitted.predict(X_test)
    return predictions, fitted