
Python machine learning trained classifier error: index out of bounds

  •  0
  •  Alan Mills  ·  8 years ago

    I have a trained classifier.

    I tried to modify it to process multiple .csv files with a loop, but that has broken things to the point where the original code (which worked fine) now returns the same error on .csv files it previously processed without any issue.

    I'm confused as to what has suddenly caused this error when everything was working fine before. The original (working) code is:

        # -*- coding: utf-8 -*-
    
        import csv
        import pandas
        import numpy as np
        import sklearn.ensemble as ske
        import re
        import os
        import collections
        import pickle
        from sklearn.externals import joblib
        from sklearn import model_selection, tree, linear_model, svm
    
    
        # Load dataset
        url = 'test_6_During_100.csv'
        dataset = pandas.read_csv(url)
        dataset.set_index('Name', inplace = True)
        ##dataset = dataset[['ProcessorAffinity','ProductVersion','Handle','Company',
        ##            'UserProcessorTime','Path','Product','Description',]]
    
        # Open file to output everything to
        new_url = re.sub('\.csv$', '', url)
        f = open(new_url + " output report", 'w')
        f.write(new_url + " output report\n")
        f.write("\n")
    
    
        # shape
        print(dataset.shape)
        print("\n")
        f.write("Dataset shape " + str(dataset.shape) + "\n")
        f.write("\n")
    
        # Load the previously trained classifier from disk
        clf = joblib.load(os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                'classifier/classifier.pkl'))
    
    
        Class_0 = []
        Class_1 = []
        prob = []
    
        for index, row in dataset.iterrows():
            res = clf.predict([row])
            if res == 0:
                if index in Class_0:
                    Class_0.append(index)
                elif index in Class_1:
                    Class_1.append(index)           
                else:
                    print "Is ", index, " recognised?"
                    designation = raw_input()
    
                    if designation == "No":
                        Class_0.append(index)
                    else:
                        Class_1.append(index)
    
        dataset['Type']  = 1                    
        dataset.loc[dataset.index.str.contains('|'.join(Class_0)), 'Type'] = 0
    
        print "\n"
    
        results = []
    
        results.append(collections.OrderedDict.fromkeys(dataset.index[dataset['Type'] == 0]))
        print (results)
    
        X = dataset.drop(['Type'], axis=1).values
        Y = dataset['Type'].values
    
    
        # Grow the existing forest by 40 trees and refit on the newly labelled rows
        clf.set_params(n_estimators = len(clf.estimators_) + 40, warm_start = True)
        clf.fit(X, Y)
        joblib.dump(clf, 'classifier/classifier.pkl')
    
        output = collections.Counter(Class_0)
    
        print "Class_0; \n"
        f.write ("Class_0; \n")
    
        for key, value in output.items():    
            f.write(str(key) + " ; " + str(value) + "\n")
            print(str(key) + " ; " + str(value))
    
        print "\n"
        f.write ("\n") 
    
        output_1 = collections.Counter(Class_1)
    
        print "Class_1; \n"
        f.write ("Class_1; \n")
    
        for key, value in output_1.items():    
            f.write(str(key) + " ; " + str(value) + "\n")
            print(str(key) + " ; " + str(value))
    
        print "\n" 
    
        f.close()
    

    # -*- coding: utf-8 -*-
    
    import csv
    import pandas
    import numpy as np
    import sklearn.ensemble as ske
    import re
    import os
    import time
    import collections
    import pickle
    from sklearn.externals import joblib
    from sklearn import model_selection, tree, linear_model, svm
    
    # Our arrays which we'll store our process details in and then later print out data for
    Class_0 = []
    Class_1 = []
    prob = []
    results = []
    
    # Open a file to output our report to
    timestr = time.strftime("%Y%m%d%H%M%S")
    
    f = open(timestr + " output report.txt", 'w')
    f.write(timestr + " output report\n")
    f.write("\n")
    
    count = len(os.listdir('.'))
    
    while (count > 0):
        # Load dataset
        for filename in os.listdir('.'):
                if filename.endswith('.csv') and filename.startswith("processes_"):
    
                    url = filename
    
                    dataset = pandas.read_csv(url)
                    dataset.set_index('Name', inplace = True)
    
                    clf = joblib.load(os.path.join(
                            os.path.dirname(os.path.realpath(__file__)),
                            'classifier/classifier.pkl'))               
    
                    for index, row in dataset.iterrows():
                        res = clf.predict([row])
                        if res == 0:
                            if index in Class_0:
                                Class_0.append(index)
                            elif index in Class_1:
                                Class_1.append(index)           
                            else:
                                print "Is ", index, " recognised?"
                                designation = raw_input()
    
                                if designation == "No":
                                    Class_0.append(index)
                                else:
                                    Class_1.append(index)
    
                    dataset['Type']  = 1                    
                    dataset.loc[dataset.index.str.contains('|'.join(Class_0)), 'Type'] = 0
    
                    print "\n"
    
                    results.append(collections.OrderedDict.fromkeys(dataset.index[dataset['Type'] == 0]))
                    print (results)
    
                    X = dataset.drop(['Type'], axis=1).values
                    Y = dataset['Type'].values
    
    
                    clf.set_params(n_estimators = len(clf.estimators_) + 40, warm_start = True)
                    clf.fit(X, Y)
                    joblib.dump(clf, 'classifier/classifier.pkl')
    
                    os.remove(filename)

        # Re-count the matching files so the while loop can terminate
        count = len([name for name in os.listdir('.') if name.endswith('.csv') and name.startswith("processes_")])
    
    
    output = collections.Counter(Class_0)
    
    print "Class_0; \n"
    f.write ("Class_0; \n")
    
    for key, value in output.items():    
        f.write(str(key) + " ; " + str(value) + "\n")
        print(str(key) + " ; " + str(value))
    
    print "\n"
    f.write ("\n") 
    
    output_1 = collections.Counter(Class_1)
    
    print "Class_1; \n"
    f.write ("Class_1; \n")
    
    for key, value in output_1.items():    
        f.write(str(key) + " ; " + str(value) + "\n")
        print(str(key) + " ; " + str(value))
    
    print "\n" 
    
    f.close()
    

    The error (IndexError: index 1 is out of bounds for size 1) is raised at the line res = clf.predict([row]). As far as I understand it, the problem is that there aren't enough "classes", or types of data label (I'm after a binary classifier)? But I had been using this exact method (outside the nested loops) without any problems.
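    For reference, the failure can be reproduced in isolation. The sketch below uses made-up data rather than the code above: it warm-starts a RandomForestClassifier that was originally fitted on two classes and then refits it on a batch containing only one class. Depending on the scikit-learn version, predicting afterwards raises an IndexError of exactly this flavour (or a related shape error), because the old trees still vote over two classes while the model's classes_ has shrunk to one.

        # Minimal sketch with made-up data: warm-starting a two-class forest
        # on a single-class batch leaves the model in an inconsistent state
        import numpy as np
        from sklearn.ensemble import RandomForestClassifier

        X0 = np.random.rand(10, 3)
        y0 = np.array([0, 1] * 5)
        clf = RandomForestClassifier(n_estimators=10)
        clf.fit(X0, y0)                 # clf.classes_ == [0, 1]

        X1 = np.random.rand(10, 3)
        y1 = np.ones(10, dtype=int)     # every label is 1
        clf.set_params(n_estimators=len(clf.estimators_) + 40, warm_start=True)
        clf.fit(X1, y1)                 # clf.classes_ collapses to [1]

        clf.predict(X1[:1])             # version-dependent IndexError / shape error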

    https://codeshare.io/Gkpb44 - codeshare link containing the .csv data for the .csv file mentioned above.

    2 replies  |  latest 8 years ago
        1
  •  0
  •   Cary Shindell    8 years ago

    The problem is the [row]; try res = clf.predict(row) instead, or take another look at the row variable. Hope this helps.
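    If the argument's shape is indeed the culprit, one alternative sketch (assuming row is the pandas Series yielded by iterrows()) is to pass an explicit 2-D array:

    # Sketch: reshape the Series into a (1, n_features) array before predicting
    res = clf.predict(row.values.reshape(1, -1))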

        2
  •  0
  •   Alan Mills    8 years ago

    So I realised what the problem was.

    For now, I have commented out the following;

    clf.set_params(n_estimators = len(clf.estimators_) + 40, warm_start = True)
    clf.fit(X, Y)
    joblib.dump(clf, 'classifier/classifier.pkl')
    

    which has solved the problem. Next, I will probably add (yet another!) conditional statement to check whether the data should be re-fitted.
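    A sketch of such a guard (assuming the forest should only be grown when the freshly labelled batch actually contains both classes, and reusing np, clf, X and Y from the script above) might look like this:

    # Only refit when both classes are present, so the warm-started
    # model never collapses down to a single class
    if len(np.unique(Y)) == 2:
        clf.set_params(n_estimators = len(clf.estimators_) + 40, warm_start = True)
        clf.fit(X, Y)
        joblib.dump(clf, 'classifier/classifier.pkl')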