我很早以前就向 Lucene 提交了一个 bug 报告,并一直在寻找这个问题的答案。但是很长时间过去了,似乎连该分析器的开发人员都不愿意回答我的问题,所以我想把它公开提出来,看看是否有其他人能解释这里发生了什么。
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
/**
 * Minimal reproduction: a word is indexed with {@link JapaneseAnalyzer} as part of a longer
 * sentence, the terms stored in the index are printed, and then the same word is searched for
 * as a quoted query using the very same analyzer instance.
 *
 * <p>The program prints every term found in the {@code content} field, the query string that is
 * executed, and the number of hits returned by the search.
 */
public class LuceneMissingTerms {

    /** Name of the only field that is indexed and queried. */
    private static final String FIELD = "content";

    /** The three-character Japanese word that is embedded in the document and later searched for. */
    private static final String JAPANESE_WORD = "\u79CB\u8449\u539F";

    /**
     * Builds an in-memory index holding one document, dumps its terms, and runs the query.
     *
     * @param args ignored
     * @throws Exception if indexing, reading the index, or parsing/running the query fails
     */
    public static void main(String[] args) throws Exception {
        // The analyzer is Closeable, so it is managed here together with the directory.
        // It is deliberately shared by the writer and the query parser below.
        try (Directory directory = new RAMDirectory();
             Analyzer analyser = new JapaneseAnalyzer()) {

            // Index a single, unstored document whose text contains the Japanese word.
            try (IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(analyser))) {
                Document document = new Document();
                document.add(new TextField(
                        FIELD,
                        "blah blah commercial blah blah " + JAPANESE_WORD + " blah blah",
                        Field.Store.NO));
                writer.addDocument(document);
            }

            try (IndexReader multiReader = DirectoryReader.open(directory)) {
                // Print every term of the field, segment by segment.
                for (LeafReaderContext leaf : multiReader.leaves()) {
                    LeafReader reader = leaf.reader();
                    Terms terms = MultiFields.getFields(reader).terms(FIELD);
                    if (terms == null) {
                        // This segment has no terms for the field; nothing to print.
                        continue;
                    }
                    TermsEnum termsEnum = terms.iterator();
                    for (BytesRef text = termsEnum.next(); text != null; text = termsEnum.next()) {
                        System.out.println("Term in index: " + text.utf8ToString());
                    }
                }

                StandardQueryParser queryParser = new StandardQueryParser(analyser);
                queryParser.setDefaultOperator(StandardQueryConfigHandler.Operator.AND);

                // Quoted to work around strange behaviour of StandardQueryParser treating this
                // as a boolean query.
                String queryString = "\"" + JAPANESE_WORD + "\"";
                Query query = queryParser.parse(queryString, FIELD);
                System.out.println("Performing query: " + queryString);

                TopDocs topDocs = new IndexSearcher(multiReader).search(query, 10);
                System.out.println("Hits count: " + topDocs.totalHits);
            }
        }
    }
}
运行此程序时,输出如下:
Term in index: blah
Term in index: commercial
Term in index: 秋葉原
Performing query: "秋葉原"
Hits count: 0
因此,索引中确实存在这个词项,但查询该词项时却找不到它。通常出现这种情况,是因为索引时使用的分析器与查询时使用的分析器不同,但在上面的示例中,两者使用的是同一个分析器对象。
所以我推测分析器中有一个 bug,它与单词周围的上下文有关,因为这是两种情况下唯一不同的地方,但我不清楚实际问题是什么,也不知道该如何解决它。
另一方面,也许这是预期的行为?如果是这样的话,我也可以关闭我的工单,不过最终用户可能会有点生气,因为我们在这样关闭问题之前,已经把这个问题搁置了好几年。