下面的数据包含异构类型的特征(数值、类别以及字符串列表)。待预测的目标变量是
DISEASE
.
我使用
CountVectorizer
来转换字符串列表。为此我创建了一个类
RavelTransformer
,它把二维的列展平为
CountVectorizer
可以接受的一维格式。
这是一个工作示例:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
class RavelTransformer(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that flattens its input to one dimension.

    It sits between a step that emits a 2-D array and a step that expects
    a flat sequence of documents.
    """

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself so calls can chain."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` collapsed to 1-D via its own ``ravel`` method."""
        flattened = X.ravel()
        return flattened
# Toy dataset: a numeric column (AGE), two categorical columns (URBAN, NAME),
# three columns holding comma-separated substring lists, and the DISEASE label.
data = {
'AGE': [39, np.nan, 21, 13, 45, 26, np.nan, 48],
'URBAN': ['urban', np.nan, 'urban', 'rural', 'urban', 'rural', 'urban', 'urban'],
'NAME': ['jack', 'juste', 'ann', np.nan, 'jack', 'gil', 'jack', 'justo'],
'SUBSTRING_1L': ['j, a, c, k', 'j, u, s, t, e', 'a, n', '', 'j, a, c, k', 'g, i, l', 'j, a, c, k', 'j, u, s, t, o'],
'SUBSTRING_4L': ['jack', 'just, uste', '', '', 'jack', '', 'jack', 'just, usto'],
'SUBSTRING_5L': ['', 'juste', '', '', '', '', '', 'justo'],
'DISEASE': ['cancer', 'healthy', 'healthy', 'healthy', 'cancer', 'healthy', 'cancer', 'heart'],
}
df = pd.DataFrame(data)
# Hold out half of the rows; only one substring column (SUBSTRING_4L) is used
# as a feature in this version, and DISEASE is the target.
x_train, x_test, y_train, y_test = train_test_split(
df[['AGE', 'NAME', 'URBAN','SUBSTRING_4L']], df['DISEASE'], test_size=0.50, random_state=3)
# Numeric branch: fill missing values with the median, then standardize.
transformer_num = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())])
# Categorical branch: replace missing values with '' and one-hot encode,
# with handle_unknown='ignore' for categories not seen during fit.
transformer_cat = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant', fill_value='')),
('onehotencoder', OneHotEncoder(handle_unknown='ignore'))])
# Text branch: replace missing values with '', flatten the column to 1-D with
# RavelTransformer, then count word tokens (vocabulary capped at 5000).
transformer_ngram = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant', fill_value='')),
('ravel', RavelTransformer()),
('countvectorizer', CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None, stop_words=None,
max_features=5000))])
# Route each group of columns to its own branch.
preprocessor = ColumnTransformer(
transformers=[
('num', transformer_num, ['AGE']),
('cat', transformer_cat, ['NAME', 'URBAN']),
# A single text column is passed here.
('ngram', transformer_ngram, ['SUBSTRING_4L']),
])
# Logistic regression configured with multi_class='ovr' on the preprocessed features.
ml_algo = LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=4000)
model = make_pipeline(preprocessor, ml_algo)
model.fit(x_train, y_train)
# Show both splits, then report the score on each.
print(x_train, '\n'*2, y_train)
print()
print(x_test, '\n'*2, y_test)
print()
print('Model score (on training set): %.3f' % model.score(x_train, y_train))
print('Model score (on test set): %.3f' % model.score(x_test, y_test))
SUBSTRING_4L
和
SUBSTRING_5L
如果只把其中一个作为特征加入,都可以正常运行。
但当我尝试把两者同时加入时:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
class RavelTransformer(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that flattens its input to one dimension.

    The whole input is flattened, not each row separately: an input with
    ``r`` rows and ``c`` columns comes back as ``r * c`` entries.
    """

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself so calls can chain."""
        return self

    def transform(self, X, y=None):
        """Return every element of ``X`` as one flat sequence via ``X.ravel()``."""
        flattened = X.ravel()
        return flattened
# Same toy dataset as before: numeric AGE, categorical URBAN/NAME,
# comma-separated substring-list columns, and the DISEASE label.
data = {
'AGE': [39, np.nan, 21, 13, 45, 26, np.nan, 48],
'URBAN': ['urban', np.nan, 'urban', 'rural', 'urban', 'rural', 'urban', 'urban'],
'NAME': ['jack', 'juste', 'ann', np.nan, 'jack', 'gil', 'jack', 'justo'],
'SUBSTRING_1L': ['j, a, c, k', 'j, u, s, t, e', 'a, n', '', 'j, a, c, k', 'g, i, l', 'j, a, c, k', 'j, u, s, t, o'],
'SUBSTRING_4L': ['jack', 'just, uste', '', '', 'jack', '', 'jack', 'just, usto'],
'SUBSTRING_5L': ['', 'juste', '', '', '', '', '', 'justo'],
'DISEASE': ['cancer', 'healthy', 'healthy', 'healthy', 'cancer', 'healthy', 'cancer', 'heart'],
}
df = pd.DataFrame(data)
# This version keeps both SUBSTRING_4L and SUBSTRING_5L as features.
x_train, x_test, y_train, y_test = train_test_split(
df[['AGE', 'NAME', 'URBAN','SUBSTRING_4L', 'SUBSTRING_5L']], df['DISEASE'], test_size=0.50, random_state=3)
# Numeric branch: fill missing values with the median, then standardize.
transformer_num = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())])
# Categorical branch: replace missing values with '' and one-hot encode,
# with handle_unknown='ignore' for categories not seen during fit.
transformer_cat = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant', fill_value='')),
('onehotencoder', OneHotEncoder(handle_unknown='ignore'))])
# Text branch: replace missing values with '', flatten with RavelTransformer,
# then count word tokens (vocabulary capped at 5000).
transformer_ngram = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant', fill_value='')),
('ravel', RavelTransformer()),
('countvectorizer', CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None, stop_words=None,
max_features=5000))])
# Route each group of columns to its own branch.
preprocessor = ColumnTransformer(
transformers=[
('num', transformer_num, ['AGE']),
('cat', transformer_cat, ['NAME', 'URBAN']),
# NOTE: two columns are handed to the text branch at once. RavelTransformer
# calls X.ravel(), which flattens the two-column block into twice as many
# entries as there are rows, so this branch no longer lines up row-for-row
# with 'num' and 'cat' (the reported error: got 8 rows, expected 4).
('ngram', transformer_ngram, ['SUBSTRING_4L', 'SUBSTRING_5L']),
])
# Logistic regression configured with multi_class='ovr' on the preprocessed features.
ml_algo = LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=4000)
model = make_pipeline(preprocessor, ml_algo)
# This fit call is where the row-dimension ValueError is raised.
model.fit(x_train, y_train)
我得到了下面这个错误:
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 8, expected 4.
.
编辑
我修改了代码,让
SUBSTRING_4L
和
SUBSTRING_5L
分别单独处理,但仍在同一个管道内。这样运行不会报错。
# Revised routing: each substring column gets its own entry, so the text
# pipeline receives one column at a time instead of a two-column block.
preprocessor = ColumnTransformer(
transformers=[
('num', transformer_num, ['AGE']),
('cat', transformer_cat, ['NAME', 'URBAN']),
# NOTE(review): both entries below reference the same transformer_ngram
# object; presumably ColumnTransformer fits an independent clone per entry
# (giving each column its own vocabulary) -- confirm against the
# scikit-learn documentation.
('ngram_4l', transformer_ngram, ['SUBSTRING_4L']),
('ngram_5l', transformer_ngram, ['SUBSTRING_5L']),
])
但有人能帮我确认这种做法是否正确吗?或者告诉我该如何验证它确实没有问题?