
How do I generate a vector matrix with GloVe?

  •  -1
  •  Suhail Gupta  · asked 7 years ago

    I am clustering text with the HDBSCAN algorithm using tf-idf vectors, and I want to use GloVe vectors (loaded through Gensim) instead, but I do not understand how I can plug GloVe into this code:

    import numpy as np
    import pandas as pd
    import nltk
    import re
    import os
    import codecs
    from sklearn import feature_extraction
    import mpld3
    import csv
    import string
    import time
    import sys
    import matplotlib.pyplot as plt
    from sklearn.metrics.pairwise import cosine_similarity
    import joblib  # sklearn.externals.joblib was removed from scikit-learn; import joblib directly
    from nltk.stem.snowball import SnowballStemmer
    from sklearn.feature_extraction.text import TfidfVectorizer
    import hdbscan
    
    csvRows = []
    nltk.download('stopwords')
    
    title = []
    synopses = []
    filename = "twitter-test-dataset.csv"
    num_clusters = 10
    pkl_file = "doc_cluster.pkl"
    generate_pkl = False
    
    # pre-process data
    with open(filename, 'r', newline='') as csvfile:  # newline='' is recommended for csv in Python 3
        # creating a csv reader object
        csvreader = csv.reader(csvfile)
    
        # extracting field names through first row
        fields = next(csvreader)  # Python 3: use next(reader); reader.next() is Python 2 only
    
        # extracting each data row one by one
        duplicates = 0
        for row in csvreader:
            # removes the characters specified
            line = re.sub(r'[.,"!]+', '', row[2], flags=re.MULTILINE)
            line = re.sub(r'^RT[\s]+', '', line, flags=re.MULTILINE)  # removes RT
            line = re.sub(r'https?:\/\/.*[\r\n]*', '',
                        line, flags=re.MULTILINE)  # remove link
            line = re.sub(r'[:]+', '', line, flags=re.MULTILINE)
            line = (re.sub(
                "(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", line, flags=re.MULTILINE))
        # keep only printable ASCII characters; filter() returns a lazy iterator in Python 3
        line = ''.join(c for c in line if c in string.printable)
            if line not in synopses:
                synopses.append(line)
                title.append(row[2])
            else:
                duplicates += 1
    
    print("Removed " + str(duplicates) + " rows")
    
    
    stopwords = nltk.corpus.stopwords.words('english')
    stemmer = SnowballStemmer("english")
    
    
    def tokenize_and_stem(text):
        # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
        tokens = [word for sent in nltk.sent_tokenize(
            text) for word in nltk.word_tokenize(sent)]
        filtered_tokens = []
        # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
        for token in tokens:
            if re.search('[a-zA-Z]', token):
                filtered_tokens.append(token)
        stems = [stemmer.stem(t) for t in filtered_tokens]
        return stems
    
    
    def tokenize_only(text):
        # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
        tokens = [word.lower() for sent in nltk.sent_tokenize(text)
                for word in nltk.word_tokenize(sent)]
        filtered_tokens = []
        # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
        for token in tokens:
            if re.search('[a-zA-Z]', token):
                filtered_tokens.append(token)
        return filtered_tokens
    
    
    totalvocab_stemmed = []
    totalvocab_tokenized = []
    
    for i in synopses:
        # for each item in 'synopses', tokenize/stem
        allwords_stemmed = tokenize_and_stem(i)
        # extend the 'totalvocab_stemmed' list
        totalvocab_stemmed.extend(allwords_stemmed)
    
        allwords_tokenized = tokenize_only(i)
        totalvocab_tokenized.extend(allwords_tokenized)
    
    vocab_frame = pd.DataFrame(
        {'words': totalvocab_tokenized}, index=totalvocab_stemmed)
    
    # print "there are " + str(vocab_frame.shape[0]) + " items in vocab_frame"
    
    
    # define vectorizer parameters
    tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                    min_df=0.0, stop_words='english',
                                    use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1, 3))
    
    #CREATE TFIDF MATRIX
    tfidf_matrix = tfidf_vectorizer.fit_transform(synopses)
    terms = tfidf_vectorizer.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2
    
    
    c = hdbscan.HDBSCAN(min_cluster_size=5)
    #PASS TFIDF_MATRIX TO HDBSCAN
    c.fit(tfidf_matrix)
    print(c.labels_)
    sys.exit()
    

    The code above uses HDBSCAN with tf-idf for text clustering. How can I use GloVe vectors in place of tf-idf?

    1 answer  ·  7 years ago
  •  0
  •  Has QUIT--Anony-Mousse  · answered 7 years ago

    The common practice seems to be to use the average of the GloVe vectors of all the words in a document.
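
    A minimal sketch of that averaging approach, not the asker's exact code: it assumes Gensim's downloadable pretrained model "glove-twitter-25" (any GloVe KeyedVectors would do) and reuses synopses and tokenize_only from the question's script.

    import numpy as np
    import gensim.downloader as api
    import hdbscan

    # load pretrained 25-dim GloVe vectors trained on tweets (downloaded on first use)
    glove = api.load("glove-twitter-25")

    def doc_vector(tokens, kv):
        # average the GloVe vectors of the in-vocabulary tokens
        vecs = [kv[t] for t in tokens if t in kv]
        if not vecs:  # no known words: fall back to a zero vector
            return np.zeros(kv.vector_size)
        return np.mean(vecs, axis=0)

    # one dense row per document, taking the place of the tf-idf matrix
    doc_matrix = np.vstack([doc_vector(tokenize_only(s), glove)
                            for s in synopses])

    clusterer = hdbscan.HDBSCAN(min_cluster_size=5)
    clusterer.fit(doc_matrix)
    print(clusterer.labels_)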

    I am not convinced by this. The theoretical justification for adding or averaging these vectors seems flimsy, because doing so destroys the angles between them (and the cosine of the angle is the usual similarity measure for such embeddings).
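
    A toy illustration of that concern, using made-up 2-d "word vectors" rather than real GloVe output:

    import numpy as np

    def cos(a, b):
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

    # two word vectors at 90 degrees: maximally dissimilar directions
    a = np.array([1.0, 0.0])
    b = np.array([0.0, 1.0])
    print(cos(a, b))  # 0.0

    # give both "documents" one shared word c, then average per document
    c = np.array([1.0, 1.0])
    doc1 = (a + c) / 2
    doc2 = (b + c) / 2
    print(cos(doc1, doc2))  # 0.8 -- the original 90-degree separation is gone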