
Synonym analyzer not working in Elasticsearch with Python

Varun Tahin · asked 7 years ago

    In the code below, I am trying to explicitly define ny and newyork as synonyms, but unfortunately it is not working. I am new to Elasticsearch; could you point me in the right direction? I also have a file synonyms.txt, which contains the following line: ny, newyork, nyork

    from datetime import datetime
    from elasticsearch import Elasticsearch
    
    es = Elasticsearch()
    
    keywords = ['thousand eyes', 'facebook', 'superdoc', 'quora', 'your story', 'Surgery', 'lending club', 'ad roll',
                'the honest company', 'Draft kings', 'newyork']
    count = 1
    
    doc_setting = {
        "settings": {
            "analysis": {
                "analyzer": {
                    "my_analyzer_keyword": {
                        "type": "custom",
                        "tokenizer": "keyword",
                        "filter": [
                            "asciifolding",
                            "lowercase",
                            "synonym"
                        ]
                    },
                    "my_analyzer_shingle": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": [
                            "asciifolding",
                            "lowercase",
                            "synonym"
                        ]
                    }
                },
                "filter": {
                    "synonym": {
                        "type": "synonym",
                        "synonyms_path": "synonyms.txt",
                        "ignore_case": "true"
                    }
                }
            }
        }, "mappings": {
            "your_type": {
                "properties": {
                    "keyword": {
                        "type": "string",
                        "index_analyzer": "my_analyzer_keyword",
                        "search_analyzer": "my_analyzer_shingle"
                    }
                }
            }
        }
    }
    
    validate=es.index(index='test', doc_type='your_type', body=doc_setting)
    
    print(validate)
    
    for keyword in keywords:
        doc = {
            'id': count,
            'keyword': keyword
        }
        res = es.index(index="test", doc_type='your_type', id=count, body=doc)
        print(res['result'])
        count = count + 1
    
    #res11 = es.get(index="test", doc_type='your_type', id=1)
    #print(res11['_source'])
    
    es.indices.refresh(index="test")
    question = "I saw news on ny news channel of lending club on facebook, your story and quora"
    print("Question asked: %s" % question)
    
    res = es.search(index="test", doc_type='your_type', body={
        "query": {"match": {"keyword": question}}})
    print("Got %d Hits:" % res['hits']['total'])
    
    for hit in res['hits']['hits']:
        print(hit["_source"])
    
    1 Answer

    sramalingam24 · answered 7 years ago

    PUT /test_index

    {
        "settings": {
            "analysis": {
                "analyzer": {
                    "my_analyzer_keyword": {
                        "type": "custom",
                        "tokenizer": "keyword",
                        "filter": [
                            "asciifolding",
                            "lowercase",
                            "synonym"
                        ]
                    },
                    "my_analyzer_shingle": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": [
                            "asciifolding",
                            "lowercase",
                            "synonym"
                        ]
                    }
                },
                "filter": {
                    "synonym" : {
                            "type" : "synonym",
                            "lenient": true,
                            "synonyms" : ["ny,newyork,nyork"]
                        }
                }
            }
        }, "mappings": {
            "your_type": {
                "properties": {
                    "keyword": {
                        "type": "text",
                        "analyzer": "my_analyzer_keyword",
                        "search_analyzer": "my_analyzer_shingle"
                    }
                }
            }
        }
    }
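
    From Python, the same index creation can be done with the elasticsearch-py client's es.indices.create. This also addresses the root problem in the question's script: es.index only stores the settings as an ordinary document, so the analyzers and mapping are never applied. A minimal sketch, using the corrected settings above:

    from elasticsearch import Elasticsearch

    es = Elasticsearch()

    # Same settings/mappings as the PUT /test_index body above.
    doc_setting = {
        "settings": {
            "analysis": {
                "analyzer": {
                    "my_analyzer_keyword": {
                        "type": "custom",
                        "tokenizer": "keyword",
                        "filter": ["asciifolding", "lowercase", "synonym"]
                    },
                    "my_analyzer_shingle": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["asciifolding", "lowercase", "synonym"]
                    }
                },
                "filter": {
                    "synonym": {
                        "type": "synonym",
                        "lenient": True,
                        "synonyms": ["ny,newyork,nyork"]
                    }
                }
            }
        },
        "mappings": {
            "your_type": {
                "properties": {
                    "keyword": {
                        "type": "text",
                        "analyzer": "my_analyzer_keyword",
                        "search_analyzer": "my_analyzer_shingle"
                    }
                }
            }
        }
    }

    # Create the index so the analyzers and mapping actually take effect
    # (unlike es.index, which would just index the settings as a document).
    es.indices.create(index='test_index', body=doc_setting)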
    

    Then, testing the analyzer with:

    POST /test_index/_analyze
    {
        "analyzer" : "my_analyzer_shingle",
      "text" : "I saw news on ny news channel of lending club on facebook, your story and quorat"
    }
    

    The tokens I get are:

    {
        "tokens": [
            {
                "token": "i",
                "start_offset": 0,
                "end_offset": 1,
                "type": "<ALPHANUM>",
                "position": 0
            },
            {
                "token": "saw",
                "start_offset": 2,
                "end_offset": 5,
                "type": "<ALPHANUM>",
                "position": 1
            },
            {
                "token": "news",
                "start_offset": 6,
                "end_offset": 10,
                "type": "<ALPHANUM>",
                "position": 2
            },
            {
                "token": "on",
                "start_offset": 11,
                "end_offset": 13,
                "type": "<ALPHANUM>",
                "position": 3
            },
            {
                "token": "ny",
                "start_offset": 14,
                "end_offset": 16,
                "type": "<ALPHANUM>",
                "position": 4
            },
            {
                "token": "newyork",
                "start_offset": 14,
                "end_offset": 16,
                "type": "SYNONYM",
                "position": 4
            },
            {
                "token": "nyork",
                "start_offset": 14,
                "end_offset": 16,
                "type": "SYNONYM",
                "position": 4
            },
            {
                "token": "news",
                "start_offset": 17,
                "end_offset": 21,
                "type": "<ALPHANUM>",
                "position": 5
            },
            {
                "token": "channel",
                "start_offset": 22,
                "end_offset": 29,
                "type": "<ALPHANUM>",
                "position": 6
            },
            {
                "token": "of",
                "start_offset": 30,
                "end_offset": 32,
                "type": "<ALPHANUM>",
                "position": 7
            },
            {
                "token": "lending",
                "start_offset": 33,
                "end_offset": 40,
                "type": "<ALPHANUM>",
                "position": 8
            },
            {
                "token": "club",
                "start_offset": 41,
                "end_offset": 45,
                "type": "<ALPHANUM>",
                "position": 9
            },
            {
                "token": "on",
                "start_offset": 46,
                "end_offset": 48,
                "type": "<ALPHANUM>",
                "position": 10
            },
            {
                "token": "facebook",
                "start_offset": 49,
                "end_offset": 57,
                "type": "<ALPHANUM>",
                "position": 11
            },
            {
                "token": "your",
                "start_offset": 59,
                "end_offset": 63,
                "type": "<ALPHANUM>",
                "position": 12
            },
            {
                "token": "story",
                "start_offset": 64,
                "end_offset": 69,
                "type": "<ALPHANUM>",
                "position": 13
            },
            {
                "token": "and",
                "start_offset": 70,
                "end_offset": 73,
                "type": "<ALPHANUM>",
                "position": 14
            },
            {
                "token": "quorat",
                "start_offset": 74,
                "end_offset": 80,
                "type": "<ALPHANUM>",
                "position": 15
            }
        ]
    }
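
    The same check can also be run from the Python client via es.indices.analyze; a minimal sketch, assuming the test_index created above:

    # Run my_analyzer_shingle over a sample string through elasticsearch-py.
    result = es.indices.analyze(index='test_index', body={
        "analyzer": "my_analyzer_shingle",
        "text": "I saw news on ny news channel of lending club on facebook"
    })
    for token in result['tokens']:
        # "ny" should additionally emit "newyork" and "nyork" as SYNONYM tokens.
        print(token['token'], token['type'])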
    

    POST /test_index/_search
    {
        "query" : {
            "match" : { "keyword" : "I saw news on ny news channel of lending club on facebook, your story and quora" }
        }
    }
    
    {
        "took": 36,
        "timed_out": false,
        "_shards": {
            "total": 5,
            "successful": 5,
            "skipped": 0,
            "failed": 0
        },
        "hits": {
            "total": 3,
            "max_score": 1.6858001,
            "hits": [
                {
                    "_index": "test_index",
                    "_type": "your_type",
                    "_id": "4",
                    "_score": 1.6858001,
                    "_source": {
                        "keyword": "newyork"
                    }
                },
                {
                    "_index": "test_index",
                    "_type": "your_type",
                    "_id": "2",
                    "_score": 1.1727304,
                    "_source": {
                        "keyword": "facebook"
                    }
                },
                {
                    "_index": "test_index",
                    "_type": "your_type",
                    "_id": "5",
                    "_score": 0.6931472,
                    "_source": {
                        "keyword": "quora"
                    }
                }
            ]
        }
    }
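
    The equivalent search from Python mirrors the question's script, pointed at the corrected index; newyork now matches because the search analyzer expands the ny in the query text:

    res = es.search(index="test_index", doc_type='your_type', body={
        "query": {"match": {"keyword": "I saw news on ny news channel of lending club on facebook, your story and quora"}}
    })
    print("Got %d Hits:" % res['hits']['total'])
    for hit in res['hits']['hits']:
        print(hit["_source"])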