
Synonym analyzer not working in Elasticsearch with Python

Varun Tahin · asked 7 years ago

    In the code below, I am trying to explicitly define ny and newyork as synonyms, but unfortunately it is not working. I am new to Elasticsearch; could you point me in the right direction? I also have a file synonyms.txt, which contains the following line: ny, newyork, nyork

    from datetime import datetime
    from elasticsearch import Elasticsearch
    
    es = Elasticsearch()
    
    keywords = ['thousand eyes', 'facebook', 'superdoc', 'quora', 'your story', 'Surgery', 'lending club', 'ad roll',
                'the honest company', 'Draft kings', 'newyork']
    count = 1
    
    doc_setting = {
        "settings": {
            "analysis": {
                "analyzer": {
                    "my_analyzer_keyword": {
                        "type": "custom",
                        "tokenizer": "keyword",
                        "filter": [
                            "asciifolding",
                            "lowercase",
                            "synonym"
                        ]
                    },
                    "my_analyzer_shingle": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": [
                            "asciifolding",
                            "lowercase",
                            "synonym"
                        ]
                    }
                },
                "filter": {
                    "synonym": {
                        "type": "synonym",
                        "synonyms_path": "synonyms.txt",
                        "ignore_case": "true"
                    }
                }
            }
        }, "mappings": {
            "your_type": {
                "properties": {
                    "keyword": {
                        "type": "string",
                        "index_analyzer": "my_analyzer_keyword",
                        "search_analyzer": "my_analyzer_shingle"
                    }
                }
            }
        }
    }
    
    validate=es.index(index='test', doc_type='your_type', body=doc_setting)
    
    print(validate)
    
    for keyword in keywords:
        doc = {
            'id': count,
            'keyword': keyword
        }
        res = es.index(index="test", doc_type='your_type', id=count, body=doc)
        print(res['result'])
        count = count + 1
    
    #res11 = es.get(index="test", doc_type='your_type', id=1)
    #print(res11['_source'])
    
    es.indices.refresh(index="test")
    question = "I saw news on ny news channel of lending club on facebook, your story and quora"
    print("Question asked: %s" % question)
    
    res = es.search(index="test", doc_type='your_type', body={
        "query": {"match": {"keyword": question}}})
    print("Got %d Hits:" % res['hits']['total'])
    
    for hit in res['hits']['hits']:
        print(hit["_source"])
    
    1 Answer

    sramalingam24 · answered 7 years ago

    PUT /test_index

    {
        "settings": {
            "analysis": {
                "analyzer": {
                    "my_analyzer_keyword": {
                        "type": "custom",
                        "tokenizer": "keyword",
                        "filter": [
                            "asciifolding",
                            "lowercase",
                            "synonym"
                        ]
                    },
                    "my_analyzer_shingle": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": [
                            "asciifolding",
                            "lowercase",
                            "synonym"
                        ]
                    }
                },
                "filter": {
                    "synonym" : {
                            "type" : "synonym",
                            "lenient": true,
                            "synonyms" : ["ny,newyork,nyork"]
                        }
                }
            }
        }, "mappings": {
            "your_type": {
                "properties": {
                    "keyword": {
                        "type": "text",
                        "analyzer": "my_analyzer_keyword",
                        "search_analyzer": "my_analyzer_shingle"
                    }
                }
            }
        }
    }
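
    From Python, the same index creation can be done with the elasticsearch-py client's es.indices.create. This also addresses the root problem in the question's script: es.index only stores the settings as an ordinary document, so the analyzers and mapping are never applied. A minimal sketch, using the corrected settings above:

    from elasticsearch import Elasticsearch

    es = Elasticsearch()

    # Same settings/mappings as the PUT /test_index body above.
    doc_setting = {
        "settings": {
            "analysis": {
                "analyzer": {
                    "my_analyzer_keyword": {
                        "type": "custom",
                        "tokenizer": "keyword",
                        "filter": ["asciifolding", "lowercase", "synonym"]
                    },
                    "my_analyzer_shingle": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["asciifolding", "lowercase", "synonym"]
                    }
                },
                "filter": {
                    "synonym": {
                        "type": "synonym",
                        "lenient": True,
                        "synonyms": ["ny,newyork,nyork"]
                    }
                }
            }
        },
        "mappings": {
            "your_type": {
                "properties": {
                    "keyword": {
                        "type": "text",
                        "analyzer": "my_analyzer_keyword",
                        "search_analyzer": "my_analyzer_shingle"
                    }
                }
            }
        }
    }

    # Create the index so the analyzers and mapping actually take effect
    # (unlike es.index, which would just index the settings as a document).
    es.indices.create(index='test_index', body=doc_setting)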
    

    Then, testing the analyzer with:

    POST /test_index/_analyze
    {
        "analyzer" : "my_analyzer_shingle",
      "text" : "I saw news on ny news channel of lending club on facebook, your story and quorat"
    }
    

    The tokens I get are:

    {
        "tokens": [
            {
                "token": "i",
                "start_offset": 0,
                "end_offset": 1,
                "type": "<ALPHANUM>",
                "position": 0
            },
            {
                "token": "saw",
                "start_offset": 2,
                "end_offset": 5,
                "type": "<ALPHANUM>",
                "position": 1
            },
            {
                "token": "news",
                "start_offset": 6,
                "end_offset": 10,
                "type": "<ALPHANUM>",
                "position": 2
            },
            {
                "token": "on",
                "start_offset": 11,
                "end_offset": 13,
                "type": "<ALPHANUM>",
                "position": 3
            },
            {
                "token": "ny",
                "start_offset": 14,
                "end_offset": 16,
                "type": "<ALPHANUM>",
                "position": 4
            },
            {
                "token": "newyork",
                "start_offset": 14,
                "end_offset": 16,
                "type": "SYNONYM",
                "position": 4
            },
            {
                "token": "nyork",
                "start_offset": 14,
                "end_offset": 16,
                "type": "SYNONYM",
                "position": 4
            },
            {
                "token": "news",
                "start_offset": 17,
                "end_offset": 21,
                "type": "<ALPHANUM>",
                "position": 5
            },
            {
                "token": "channel",
                "start_offset": 22,
                "end_offset": 29,
                "type": "<ALPHANUM>",
                "position": 6
            },
            {
                "token": "of",
                "start_offset": 30,
                "end_offset": 32,
                "type": "<ALPHANUM>",
                "position": 7
            },
            {
                "token": "lending",
                "start_offset": 33,
                "end_offset": 40,
                "type": "<ALPHANUM>",
                "position": 8
            },
            {
                "token": "club",
                "start_offset": 41,
                "end_offset": 45,
                "type": "<ALPHANUM>",
                "position": 9
            },
            {
                "token": "on",
                "start_offset": 46,
                "end_offset": 48,
                "type": "<ALPHANUM>",
                "position": 10
            },
            {
                "token": "facebook",
                "start_offset": 49,
                "end_offset": 57,
                "type": "<ALPHANUM>",
                "position": 11
            },
            {
                "token": "your",
                "start_offset": 59,
                "end_offset": 63,
                "type": "<ALPHANUM>",
                "position": 12
            },
            {
                "token": "story",
                "start_offset": 64,
                "end_offset": 69,
                "type": "<ALPHANUM>",
                "position": 13
            },
            {
                "token": "and",
                "start_offset": 70,
                "end_offset": 73,
                "type": "<ALPHANUM>",
                "position": 14
            },
            {
                "token": "quorat",
                "start_offset": 74,
                "end_offset": 80,
                "type": "<ALPHANUM>",
                "position": 15
            }
        ]
    }
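
    The same check can also be run from the Python client via es.indices.analyze; a minimal sketch, assuming the test_index created above:

    # Run my_analyzer_shingle over a sample string through elasticsearch-py.
    result = es.indices.analyze(index='test_index', body={
        "analyzer": "my_analyzer_shingle",
        "text": "I saw news on ny news channel of lending club on facebook"
    })
    for token in result['tokens']:
        # "ny" should additionally emit "newyork" and "nyork" as SYNONYM tokens.
        print(token['token'], token['type'])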
    

    POST /test_index/_search
    {
        "query" : {
            "match" : { "keyword" : "I saw news on ny news channel of lending club on facebook, your story and quora" }
        }
    }
    
    {
        "took": 36,
        "timed_out": false,
        "_shards": {
            "total": 5,
            "successful": 5,
            "skipped": 0,
            "failed": 0
        },
        "hits": {
            "total": 3,
            "max_score": 1.6858001,
            "hits": [
                {
                    "_index": "test_index",
                    "_type": "your_type",
                    "_id": "4",
                    "_score": 1.6858001,
                    "_source": {
                        "keyword": "newyork"
                    }
                },
                {
                    "_index": "test_index",
                    "_type": "your_type",
                    "_id": "2",
                    "_score": 1.1727304,
                    "_source": {
                        "keyword": "facebook"
                    }
                },
                {
                    "_index": "test_index",
                    "_type": "your_type",
                    "_id": "5",
                    "_score": 0.6931472,
                    "_source": {
                        "keyword": "quora"
                    }
                }
            ]
        }
    }
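
    The equivalent search from Python mirrors the question's script, pointed at the corrected index; newyork now matches because the search analyzer expands the ny in the query text:

    res = es.search(index="test_index", doc_type='your_type', body={
        "query": {"match": {"keyword": "I saw news on ny news channel of lending club on facebook, your story and quora"}}
    })
    print("Got %d Hits:" % res['hits']['total'])
    for hit in res['hits']['hits']:
        print(hit["_source"])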