IV. Span Query
SpanQuery 是 Lucene 1.4 中新增的一类比较有用的 Query，它可以保存更多有用的信息（例如匹配内容在文档中的位置）。它有如下几个子类：
| SpanQuery type | Description |
| --- | --- |
| SpanTermQuery | Used in conjunction with the other span query types. On its own, it's functionally equivalent to TermQuery. |
| SpanFirstQuery | Matches spans that occur within the first part of a field. |
| SpanNearQuery | Matches spans that occur near one another. |
| SpanNotQuery | Matches spans that don't overlap one another. |
| SpanOrQuery | Aggregates matches of span queries. |
看个例子,
001 package lia.advsearching;
002
003 import junit.framework.TestCase;
004 import lia.analysis.AnalyzerUtils;
005 import org.apache.lucene.analysis.Analyzer;
006 import org.apache.lucene.analysis.Token;
007 import org.apache.lucene.analysis.WhitespaceAnalyzer;
008 import org.apache.lucene.document.Document;
009 import org.apache.lucene.document.Field;
010 import org.apache.lucene.index.IndexReader;
011 import org.apache.lucene.index.IndexWriter;
012 import org.apache.lucene.index.Term;
013 import org.apache.lucene.search.Hits;
014 import org.apache.lucene.search.IndexSearcher;
015 import org.apache.lucene.search.PhraseQuery;
016 import org.apache.lucene.search.Query;
017 import org.apache.lucene.search.spans.SpanFirstQuery;
018 import org.apache.lucene.search.spans.SpanNearQuery;
019 import org.apache.lucene.search.spans.SpanNotQuery;
020 import org.apache.lucene.search.spans.SpanOrQuery;
021 import org.apache.lucene.search.spans.SpanQuery;
022 import org.apache.lucene.search.spans.SpanTermQuery;
023 import org.apache.lucene.search.spans.Spans;
024 import org.apache.lucene.store.RAMDirectory;
025
026 import java.io.IOException;
027
028 public class SpanQueryTest extends TestCase {
029 private RAMDirectory directory;
030 private IndexSearcher searcher;
031 private IndexReader reader;
032
033 private SpanTermQuery quick;
034 private SpanTermQuery brown;
035 private SpanTermQuery red;
036 private SpanTermQuery fox;
037 private SpanTermQuery lazy;
038 private SpanTermQuery sleepy;
039 private SpanTermQuery dog;
040 private SpanTermQuery cat;
041 private Analyzer analyzer;
042
043 protected void setUp() throws Exception {
044 directory = new RAMDirectory();
045
046 analyzer = new WhitespaceAnalyzer();
047 IndexWriter writer = new IndexWriter(directory,
048 analyzer, true);
049
050 Document doc = new Document();
051 doc.add(Field.Text("f",
052 "the quick brown fox jumps over the lazy dog")); // 添加doc1
053 writer.addDocument(doc);
054
055 doc = new Document();
056 doc.add(Field.Text("f",
057 "the quick red fox jumps over the sleepy cat"));// 添加toc2
058 writer.addDocument(doc);
059
060 writer.close();
061
062 searcher = new IndexSearcher(directory);
063 reader = IndexReader.open(directory);
064
065 quick = new SpanTermQuery(new Term("f", "quick")); //构造SpanTermQuery 该类是其他几个类的基础
066 brown = new SpanTermQuery(new Term("f", "brown"));
067 red = new SpanTermQuery(new Term("f", "red"));
068 fox = new SpanTermQuery(new Term("f", "fox"));
069 lazy = new SpanTermQuery(new Term("f", "lazy"));
070 sleepy = new SpanTermQuery(new Term("f", "sleepy"));
071 dog = new SpanTermQuery(new Term("f", "dog"));
072 cat = new SpanTermQuery(new Term("f", "cat"));
073 }
074 // 下面是3个帮助测试的函数
075 private void assertOnlyBrownFox(Query query)throws Exception {
076 Hits hits = searcher.search(query);
077 assertEquals(1, hits.length());
078 assertEquals("wrong doc", 0, hits.id(0));
079 }
080
081 private void assertBothFoxes(Query query) throws Exception {
082 Hits hits = searcher.search(query);
083 assertEquals(2, hits.length());
084 }
085
086 private void assertNoMatches(Query query) throws Exception {
087 Hits hits = searcher.search(query);
088 assertEquals(0, hits.length());
089 }
090
091 public void testSpanTermQuery() throws Exception { // 单个TermQuery在功能上和TermQuery相似
092 assertOnlyBrownFox(brown);
093 dumpSpans(brown); /// 参考结果(1)
094 }
095
096 public void testSpanFirstQuery() throws Exception { // 在给定的范围搜索
097 SpanFirstQuery sfq = new SpanFirstQuery(brown, 2); // 前两个 "the quick brown fox jumps over the lazy dog"
098 assertNoMatches(sfq);
099
100 dumpSpans(sfq);
101
102 sfq = new SpanFirstQuery(brown, 3); // 前3个 "the quick brown fox jumps over the lazy dog"
103 dumpSpans(sfq);
104 assertOnlyBrownFox(sfq);
105 }
106
107 public void testSpanNearQuery() throws Exception {
108 SpanQuery[] quick_brown_dog =
109 new SpanQuery[]{quick, brown, dog};
110 SpanNearQuery snq =
111 new SpanNearQuery(quick_brown_dog, 0, true); // 没有匹配的结果
112 assertNoMatches(snq);
113 dumpSpans(snq);
114
115 snq = new SpanNearQuery(quick_brown_dog, 4, true);// 没有匹配的结果
116 assertNoMatches(snq);
117 dumpSpans(snq);
118
119 snq = new SpanNearQuery(quick_brown_dog, 5, true); //
120 assertOnlyBrownFox(snq);
121 dumpSpans(snq);
122
123 // interesting - even a sloppy phrase query would require
124 // more slop to match
125 snq = new SpanNearQuery(new SpanQuery[]{lazy, fox}, 3, false); // 注意这个
126 assertOnlyBrownFox(snq);
127 dumpSpans(snq);
128
129 PhraseQuery pq = new PhraseQuery();
130 pq.add(new Term("f", "lazy"));
131 pq.add(new Term("f", "fox"));
132 pq.setSlop(4);
133 assertNoMatches(pq);
134
135 pq.setSlop(5);
136 assertOnlyBrownFox(pq);
137 }
138
139 public void testSpanNotQuery() throws Exception {
140 SpanNearQuery quick_fox =
141 new SpanNearQuery(new SpanQuery[]{quick, fox}, 1, true);
142 assertBothFoxes(quick_fox);
143 dumpSpans(quick_fox); // 结果看下面(2)
144
145 SpanNotQuery quick_fox_dog = new SpanNotQuery(quick_fox, dog);
146 assertBothFoxes(quick_fox_dog);
147 dumpSpans(quick_fox_dog);
148
149 SpanNotQuery no_quick_red_fox =
150 new SpanNotQuery(quick_fox, red);
151 assertOnlyBrownFox(no_quick_red_fox);
152 dumpSpans(no_quick_red_fox);
153 }
154
155 public void testSpanOrQuery() throws Exception { // 参考结果(3)
156 SpanNearQuery quick_fox =
157 new SpanNearQuery(new SpanQuery[]{quick, fox}, 1, true);
158
159 SpanNearQuery lazy_dog =
160 new SpanNearQuery(new SpanQuery[]{lazy, dog}, 0, true);
161
162 SpanNearQuery sleepy_cat =
163 new SpanNearQuery(new SpanQuery[]{sleepy, cat}, 0, true);
164
165 SpanNearQuery qf_near_ld =
166 new SpanNearQuery(
167 new SpanQuery[]{quick_fox, lazy_dog}, 3, true);
168 assertOnlyBrownFox(qf_near_ld);
169 dumpSpans(qf_near_ld);
170
171 SpanNearQuery qf_near_sc =
172 new SpanNearQuery(
173 new SpanQuery[]{quick_fox, sleepy_cat}, 3, true);
174 dumpSpans(qf_near_sc);
175
176 SpanOrQuery or = new SpanOrQuery(
177 new SpanQuery[]{qf_near_ld, qf_near_sc});
178 assertBothFoxes(or);
179 dumpSpans(or);
180 }
181
182 public void testPlay() throws Exception {
183 SpanOrQuery or = new SpanOrQuery(new SpanQuery[]{quick, fox});
184 dumpSpans(or);
185
186 SpanNearQuery quick_fox =
187 new SpanNearQuery(new SpanQuery[]{quick, fox}, 1, true);
188 SpanFirstQuery sfq = new SpanFirstQuery(quick_fox, 4);
189 dumpSpans(sfq);
190
191 dumpSpans(new SpanTermQuery(new Term("f", "the")));
192
193 SpanNearQuery quick_brown =
194 new SpanNearQuery(new SpanQuery[]{quick, brown}, 0, false);
195 dumpSpans(quick_brown);
196
197 }
198
199 private void dumpSpans(SpanQuery query) throws IOException {
200 Spans spans = query.getSpans(reader);
201 System.out.println(query + ":");
202 int numSpans = 0;
203
204 Hits hits = searcher.search(query);
205 float[] scores = new float[2];
206 for (int i = 0; i < hits.length(); i++) {
207 scores[hits.id(i)] = hits.score(i);
208 }
209
210 while (spans.next()) {
211 numSpans++;
212
213 int id = spans.doc();
214 Document doc = reader.document(id);
215
216 // for simplicity - assume tokens are in sequential,
217 // positions, starting from 0
218 Token[] tokens = AnalyzerUtils.tokensFromAnalysis(
219 analyzer, doc.get("f"));
220 StringBuffer buffer = new StringBuffer();
221 buffer.append(" ");
222 for (int i = 0; i < tokens.length; i++) {
223 if (i == spans.start()) {
224 buffer.append("<");
225 }
226 buffer.append(tokens[i].termText());
227 if (i + 1 == spans.end()) {
228 buffer.append(">");
229 }
230 buffer.append(" ");
231 }
232 buffer.append("(" + scores[id] + ") ");
233 System.out.println(buffer);
234 // System.out.println(searcher.explain(query, id));
235 }
236
237 if (numSpans == 0) {
238 System.out.println(" No spans");
239 }
240 System.out.println();
241 }
242 }
输出结果(1):
dumpSpans(brown); 的结果如下.
f:brown:
the quick <brown> fox jumps over the lazy dog (0.22097087)
如果调用该函数dumpSpans(new SpanTermQuery(new Term("f", "the")));结果如下
f:the:
<the> quick brown fox jumps over the lazy dog (0.18579213)
the quick brown fox jumps over <the> lazy dog (0.18579213)
<the> quick red fox jumps over the sleepy cat (0.18579213)
the quick red fox jumps over <the> sleepy cat (0.18579213)
(2)
SpanNearQuery quick_fox = new SpanNearQuery(new SpanQuery[]{quick, fox}, 1, true);
spanNear([f:quick, f:fox], 1, true):
the <quick brown fox> jumps over the lazy dog (0.18579213)
the <quick red fox> jumps over the sleepy cat (0.18579213)
SpanNotQuery quick_fox_dog = new SpanNotQuery(quick_fox, dog);
spanNot(spanNear([f:quick, f:fox], 1, true), f:dog):
the <quick brown fox> jumps over the lazy dog (0.18579213)
the <quick red fox> jumps over the sleepy cat (0.18579213)
SpanNotQuery no_quick_red_fox = new SpanNotQuery(quick_fox, red);
spanNot(spanNear([f:quick, f:fox], 1, true), f:red):
the <quick brown fox> jumps over the lazy dog (0.18579213)
(3)
SpanNearQuery qf_near_ld =
new SpanNearQuery(
new SpanQuery[]{quick_fox, lazy_dog}, 3, true);
spanNear([spanNear([f:quick, f:fox], 1, true),
spanNear([f:lazy, f:dog], 0, true)], 3, true):
the <quick brown fox jumps over the lazy dog> (0.3321948)
SpanNearQuery qf_near_sc =
new SpanNearQuery(
new SpanQuery[]{quick_fox, sleepy_cat}, 3, true);
spanNear([spanNear([f:quick, f:fox], 1, true),
spanNear([f:sleepy, f:cat], 0, true)], 3, true):
the <quick red fox jumps over the sleepy cat> (0.3321948)
SpanOrQuery or = new SpanOrQuery(new SpanQuery[]{qf_near_ld, qf_near_sc});
spanOr([spanNear([spanNear([f:quick, f:fox], 1, true),
spanNear([f:lazy, f:dog], 0, true)], 3, true),
spanNear([spanNear([f:quick, f:fox], 1, true),
spanNear([f:sleepy, f:cat], 0, true)], 3, true)]):
the <quick brown fox jumps over the lazy dog> (0.6643896)
the <quick red fox jumps over the sleepy cat> (0.6643896)
关于SpanQuery的详细信息请参考Lucene in Action
V. 关于Filter
Filter 顾名思义，就是用来过滤搜索结果的。在 Lucene 中有 3 个过滤器：
■ DateFilter constrains the document space to only documents with a specified date field within a given range of dates.
■ QueryFilter uses the results of a query as the searchable document space for a new query.
■ CachingWrapperFilter is a decorator over another filter, caching its results to increase performance when used again.
Filter 的使用是很简单的，看个例子：
001 package lia.advsearching;
002
003 import lia.common.LiaTestCase;
004 import org.apache.lucene.document.DateField;
005 import org.apache.lucene.index.Term;
006 import org.apache.lucene.search.BooleanQuery;
007 import org.apache.lucene.search.CachingWrapperFilter;
008 import org.apache.lucene.search.DateFilter;
009 import org.apache.lucene.search.Filter;
010 import org.apache.lucene.search.Hits;
011 import org.apache.lucene.search.IndexSearcher;
012 import org.apache.lucene.search.Query;
013 import org.apache.lucene.search.QueryFilter;
014 import org.apache.lucene.search.RangeQuery;
015 import org.apache.lucene.search.TermQuery;
016
017 import java.util.Date;
018
019 public class FilterTest extends LiaTestCase {
020 private Query allBooks;
021 private IndexSearcher searcher;
022 private int numAllBooks;
023 private CachingWrapperFilter cachingFilter;
024
025 protected void setUp() throws Exception {
026 super.setUp();
027
028 allBooks = new RangeQuery(new Term("pubmonth","190001"),
029 new Term("pubmonth", "200512"),
030 true);
031 searcher = new IndexSearcher(directory);
032 Hits hits = searcher.search(allBooks);
033 numAllBooks = hits.length();
034 }
035
036 public void testDateFilter() throws Exception {
037 Date jan1 = parseDate("2004-01-01");
038 Date jan31 = parseDate("2004-01-31");
039 Date dec31 = parseDate("2004-12-31");
040
041 DateFilter filter = new DateFilter("modified", jan1, dec31); // 创建DateFilter
042
043 Hits hits = searcher.search(allBooks, filter); // 用Filter来搜索
044 assertEquals("all modified in 2004",
045 numAllBooks, hits.length());
046
047 filter = new DateFilter("modified", jan1, jan31); // 用Filter来搜索
048 hits = searcher.search(allBooks, filter);
049 assertEquals("none modified in January",
050 0, hits.length());
051 }
052
053 public void testQueryFilter() throws Exception {
054 TermQuery categoryQuery =
055 new TermQuery(new Term("category", "/philosophy/eastern"));
056
057 Filter categoryFilter = new QueryFilter(categoryQuery);////QueryFilter
058
059 Hits hits = searcher.search(allBooks, categoryFilter);
060 assertEquals("only tao te ching", 1, hits.length());
061 }
062
063 public void testFilterAlternative() throws Exception { //用BooleanQuery实现 QueryFilter
064 TermQuery categoryQuery =
065 new TermQuery(new Term("category", "/philosophy/eastern"));
066
067 BooleanQuery constrainedQuery = new BooleanQuery();
068 constrainedQuery.add(allBooks, true, false);
069 constrainedQuery.add(categoryQuery, true, false);
070
071 Hits hits = searcher.search(constrainedQuery);
072 assertEquals("only tao te ching", 1, hits.length());
073 }
074
075
076 public void testQueryFilterWithRangeQuery() throws Exception {
077 Date jan1 = parseDate("2004-01-01");
078 Date dec31 = parseDate("2004-12-31");
079
080 Term start = new Term("modified",
081 DateField.dateToString(jan1));
082 Term end = new Term("modified",
083 DateField.dateToString(dec31));
084
085 Query rangeQuery = new RangeQuery(start, end, true);
086
087 Filter filter = new QueryFilter(rangeQuery);
088 Hits hits = searcher.search(allBooks, filter); // 过虑
089 assertEquals("all of 'em", numAllBooks, hits.length());
090 }
091
092 public void testCachingWrapper() throws Exception { // 当重用搜索结果时候 使用CachingWrapperFilter 可以提供性能
093 Date jan1 = parseDate("2004-01-01");
094 Date dec31 = parseDate("2004-12-31");
095
096 DateFilter dateFilter =
097 new DateFilter("modified", jan1, dec31);
098
099 cachingFilter =
100 new CachingWrapperFilter(dateFilter); // 创造 CachingWrapperFilter
101 Hits hits = searcher.search(allBooks, cachingFilter);
102 assertEquals("all of 'em", numAllBooks, hits.length());
103 }
104 }
关于 DateFilter，还可以使用只限定一端日期的 Filter，如下：
filter = DateFilter.Before("modified", endDate);
filter = DateFilter.After("modified", startDate);
可以参考 Filter 的源码来写自己的 Filter，还可以使用 Lucene Sandbox 中的 Filter。
VI.在多个index中搜索.
这在应用中可能会经常用到，而且也很容易使用：只要用 MultiSearcher 来代替 IndexSearcher 就可以了，如下所示：
// Open one IndexSearcher per index directory, then hand both to a
// MultiSearcher, which is used in place of a single IndexSearcher.
searchers = new IndexSearcher[] {
    new IndexSearcher(aTOmDirectory),
    new IndexSearcher(nTOzDirectory)
};
MultiSearcher searcher = new MultiSearcher(searchers);
完整测试代码点击此处.
VII.Leveraging term vectors
这也是1.4中新加的东东. 好像不怎么用到 有兴趣时再看吧.
