Lucene In Action ch 5 II 笔记 --高级搜索技术

作者:admin

IV. Span Query

SpanQuery 是 Lucene 1.4 中新添加的一种比较有用的 Query。它可以保存更多有用的信息。它有如下几个子类:

SpanQuery type

Description

SpanTermQuery

Used in conjunction with the other span query types. On its own, it's functionally equivalent to TermQuery.

SpanFirstQuery

Matches spans that occur within the first part of a field.

SpanNearQuery

Matches spans that occur near one another.

SpanNotQuery

Matches spans that don't overlap one another.

SpanOrQuery

Aggregates matches of span queries.

看个例子,

package lia.advsearching;

import junit.framework.TestCase;
import lia.analysis.AnalyzerUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.spans.SpanFirstQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanNotQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.store.RAMDirectory;

import java.io.IOException;

/**
 * Demonstrates the span query family (SpanTermQuery, SpanFirstQuery,
 * SpanNearQuery, SpanNotQuery, SpanOrQuery) against a two-document
 * in-memory index.
 *
 * <p>Document 0 is "the quick brown fox jumps over the lazy dog" and
 * document 1 is "the quick red fox jumps over the sleepy cat"; both are
 * stored in field "f" and tokenized with a WhitespaceAnalyzer.
 */
public class SpanQueryTest extends TestCase {
  private RAMDirectory directory;
  private IndexSearcher searcher;
  private IndexReader reader;

  private SpanTermQuery quick;
  private SpanTermQuery brown;
  private SpanTermQuery red;
  private SpanTermQuery fox;
  private SpanTermQuery lazy;
  private SpanTermQuery sleepy;
  private SpanTermQuery dog;
  private SpanTermQuery cat;
  private Analyzer analyzer;

  /**
   * Builds the two-document index and the SpanTermQuery instances that the
   * other span query types are composed from.
   */
  protected void setUp() throws Exception {
    directory = new RAMDirectory();

    analyzer = new WhitespaceAnalyzer();
    IndexWriter writer = new IndexWriter(directory,
        analyzer, true);

    Document doc = new Document();
    doc.add(Field.Text("f",
        "the quick brown fox jumps over the lazy dog")); // document 0
    writer.addDocument(doc);

    doc = new Document();
    doc.add(Field.Text("f",
        "the quick red fox jumps over the sleepy cat")); // document 1
    writer.addDocument(doc);

    writer.close();

    searcher = new IndexSearcher(directory);
    reader = IndexReader.open(directory);

    // SpanTermQuery is the building block for every other span query.
    quick = new SpanTermQuery(new Term("f", "quick"));
    brown = new SpanTermQuery(new Term("f", "brown"));
    red = new SpanTermQuery(new Term("f", "red"));
    fox = new SpanTermQuery(new Term("f", "fox"));
    lazy = new SpanTermQuery(new Term("f", "lazy"));
    sleepy = new SpanTermQuery(new Term("f", "sleepy"));
    dog = new SpanTermQuery(new Term("f", "dog"));
    cat = new SpanTermQuery(new Term("f", "cat"));
  }

  /**
   * Releases the searcher and reader opened in {@link #setUp()} so each test
   * does not leave them open.
   */
  protected void tearDown() throws Exception {
    if (searcher != null) {
      searcher.close();
    }
    if (reader != null) {
      reader.close();
    }
  }

  // ---- Three assertion helpers shared by the tests below ----

  /** Asserts that the query matches exactly one document: document 0. */
  private void assertOnlyBrownFox(Query query) throws Exception {
    Hits hits = searcher.search(query);
    assertEquals(1, hits.length());
    assertEquals("wrong doc", 0, hits.id(0));
  }

  /** Asserts that the query matches both indexed documents. */
  private void assertBothFoxes(Query query) throws Exception {
    Hits hits = searcher.search(query);
    assertEquals(2, hits.length());
  }

  /** Asserts that the query matches no document. */
  private void assertNoMatches(Query query) throws Exception {
    Hits hits = searcher.search(query);
    assertEquals(0, hits.length());
  }

  /** A lone SpanTermQuery behaves like a TermQuery. See output (1). */
  public void testSpanTermQuery() throws Exception {
    assertOnlyBrownFox(brown);
    dumpSpans(brown);
  }

  /** SpanFirstQuery only accepts spans that end within the given limit. */
  public void testSpanFirstQuery() throws Exception {
    // Limit 2 covers only "the quick" of
    // "the quick brown fox jumps over the lazy dog", so brown is excluded.
    SpanFirstQuery sfq = new SpanFirstQuery(brown, 2);
    assertNoMatches(sfq);

    dumpSpans(sfq);

    // Limit 3 covers "the quick brown", so brown now matches.
    sfq = new SpanFirstQuery(brown, 3);
    dumpSpans(sfq);
    assertOnlyBrownFox(sfq);
  }

  /** SpanNearQuery matches spans that lie within a given slop of each other. */
  public void testSpanNearQuery() throws Exception {
    SpanQuery[] quick_brown_dog =
        new SpanQuery[]{quick, brown, dog};
    SpanNearQuery snq =
        new SpanNearQuery(quick_brown_dog, 0, true);  // no match
    assertNoMatches(snq);
    dumpSpans(snq);

    snq = new SpanNearQuery(quick_brown_dog, 4, true);  // still no match
    assertNoMatches(snq);
    dumpSpans(snq);

    snq = new SpanNearQuery(quick_brown_dog, 5, true);  // slop 5 matches document 0
    assertOnlyBrownFox(snq);
    dumpSpans(snq);

    // interesting - even a sloppy phrase query would require
    // more slop to match
    // Note: the last argument is false (unordered), so "lazy" and "fox" may
    // appear in either order.
    snq = new SpanNearQuery(new SpanQuery[]{lazy, fox}, 3, false);
    assertOnlyBrownFox(snq);
    dumpSpans(snq);

    PhraseQuery pq = new PhraseQuery();
    pq.add(new Term("f", "lazy"));
    pq.add(new Term("f", "fox"));
    pq.setSlop(4);
    assertNoMatches(pq);

    pq.setSlop(5);
    assertOnlyBrownFox(pq);
  }

  /** SpanNotQuery drops spans that overlap the excluded query. See output (2). */
  public void testSpanNotQuery() throws Exception {
    SpanNearQuery quick_fox =
        new SpanNearQuery(new SpanQuery[]{quick, fox}, 1, true);
    assertBothFoxes(quick_fox);
    dumpSpans(quick_fox);

    // "dog" does not fall inside either quick..fox span, so both remain.
    SpanNotQuery quick_fox_dog = new SpanNotQuery(quick_fox, dog);
    assertBothFoxes(quick_fox_dog);
    dumpSpans(quick_fox_dog);

    // "red" sits inside the quick..fox span of document 1, which is removed.
    SpanNotQuery no_quick_red_fox =
        new SpanNotQuery(quick_fox, red);
    assertOnlyBrownFox(no_quick_red_fox);
    dumpSpans(no_quick_red_fox);
  }

  /** SpanOrQuery aggregates the matches of its clauses. See output (3). */
  public void testSpanOrQuery() throws Exception {
    SpanNearQuery quick_fox =
        new SpanNearQuery(new SpanQuery[]{quick, fox}, 1, true);

    SpanNearQuery lazy_dog =
        new SpanNearQuery(new SpanQuery[]{lazy, dog}, 0, true);

    SpanNearQuery sleepy_cat =
        new SpanNearQuery(new SpanQuery[]{sleepy, cat}, 0, true);

    SpanNearQuery qf_near_ld =
        new SpanNearQuery(
            new SpanQuery[]{quick_fox, lazy_dog}, 3, true);
    assertOnlyBrownFox(qf_near_ld);
    dumpSpans(qf_near_ld);

    SpanNearQuery qf_near_sc =
        new SpanNearQuery(
            new SpanQuery[]{quick_fox, sleepy_cat}, 3, true);
    dumpSpans(qf_near_sc);

    SpanOrQuery or = new SpanOrQuery(
        new SpanQuery[]{qf_near_ld, qf_near_sc});
    assertBothFoxes(or);
    dumpSpans(or);
  }

  /** Scratch test: prints the spans of a few ad-hoc queries, asserts nothing. */
  public void testPlay() throws Exception {
    SpanOrQuery or = new SpanOrQuery(new SpanQuery[]{quick, fox});
    dumpSpans(or);

    SpanNearQuery quick_fox =
        new SpanNearQuery(new SpanQuery[]{quick, fox}, 1, true);
    SpanFirstQuery sfq = new SpanFirstQuery(quick_fox, 4);
    dumpSpans(sfq);

    dumpSpans(new SpanTermQuery(new Term("f", "the")));

    SpanNearQuery quick_brown =
        new SpanNearQuery(new SpanQuery[]{quick, brown}, 0, false);
    dumpSpans(quick_brown);

  }

  /**
   * Prints every span matched by the query to standard output: the query
   * itself, then one line per span showing the document text with the span
   * wrapped in angle brackets, followed by the document's score.
   *
   * @param query the span query to enumerate
   * @throws IOException if reading the index fails
   */
  private void dumpSpans(SpanQuery query) throws IOException {
    Spans spans = query.getSpans(reader);
    System.out.println(query + ":");
    int numSpans = 0;

    // Collect the score per document id; setUp() indexes exactly two
    // documents, hence the fixed size of 2.
    Hits hits = searcher.search(query);
    float[] scores = new float[2];
    for (int i = 0; i < hits.length(); i++) {
      scores[hits.id(i)] = hits.score(i);
    }

    while (spans.next()) {
      numSpans++;

      int id = spans.doc();
      Document doc = reader.document(id);

      // for simplicity - assume tokens are in sequential,
      // positions, starting from 0
      Token[] tokens = AnalyzerUtils.tokensFromAnalysis(
          analyzer, doc.get("f"));
      StringBuffer buffer = new StringBuffer();
      buffer.append("   ");
      for (int i = 0; i < tokens.length; i++) {
        if (i == spans.start()) {
          buffer.append("<");
        }
        buffer.append(tokens[i].termText());
        // The closing bracket goes after the token just before spans.end().
        if (i + 1 == spans.end()) {
          buffer.append(">");
        }
        buffer.append(" ");
      }
      buffer.append("(" + scores[id] + ") ");
      System.out.println(buffer);
      // To see how the score was computed, print searcher.explain(query, id).
    }

    if (numSpans == 0) {
      System.out.println("   No spans");
    }
    System.out.println();
  }
}

输出结果(1):

dumpSpans(brown); 的结果如下.

f:brown:

  the quick <brown> fox jumps over the lazy dog (0.22097087)

 

如果调用该函数dumpSpans(new SpanTermQuery(new Term("f", "the")));结果如下

 

f:the:

  <the> quick brown fox jumps over the lazy dog (0.18579213)

  the quick brown fox jumps over <the> lazy dog (0.18579213)

 

  <the> quick red fox jumps over the sleepy cat (0.18579213)

  the quick red fox jumps over <the> sleepy cat (0.18579213)

 

(2)

SpanNearQuery quick_fox =

 

        new SpanNearQuery(new SpanQuery[]{quick, fox}, 1, true);

 

spanNear([f:quick, f:fox], 1, true):

  the <quick brown fox> jumps over the lazy dog (0.18579213)

  the <quick red fox> jumps over the sleepy cat (0.18579213)

SpanNotQuery quick_fox_dog = new SpanNotQuery(quick_fox, dog);

 

spanNot(spanNear([f:quick, f:fox], 1, true), f:dog):

  the <quick brown fox> jumps over the lazy dog (0.18579213)

  the <quick red fox> jumps over the sleepy cat (0.18579213)

SpanNotQuery no_quick_red_fox =

 

         new SpanNotQuery(quick_fox, red);

 

spanNot(spanNear([f:quick, f:fox], 1, true), f:red):

  the <quick brown fox> jumps over the lazy dog (0.18579213)

 

(3)

SpanNearQuery qf_near_ld =

new SpanNearQuery(

new SpanQuery[]{quick_fox, lazy_dog}, 3, true);

 

spanNear([spanNear([f:quick, f:fox], 1, true),

spanNear([f:lazy, f:dog], 0, true)], 3, true):

  the <quick brown fox jumps over the lazy dog> (0.3321948)

 

SpanNearQuery qf_near_sc =

new SpanNearQuery(

new SpanQuery[]{quick_fox, sleepy_cat}, 3, true);

 

spanNear([spanNear([f:quick, f:fox], 1, true),

spanNear([f:sleepy, f:cat], 0, true)], 3, true):

  the <quick red fox jumps over the sleepy cat> (0.3321948)

SpanOrQuery or = new SpanOrQuery(

 

new SpanQuery[]{qf_near_ld, qf_near_sc});

 

spanOr([spanNear([spanNear([f:quick, f:fox], 1, true),

spanNear([f:lazy, f:dog], 0, true)], 3, true),

spanNear([spanNear([f:quick, f:fox], 1, true),

spanNear([f:sleepy, f:cat], 0, true)], 3, true)]):

  the <quick brown fox jumps over the lazy dog> (0.6643896)

  the <quick red fox jumps over the sleepy cat> (0.6643896)

 

关于SpanQuery的详细信息请参考Lucene in Action

V. 关于Filter

Filter 顾名思义, 就是用来过滤搜索结果的。在 Lucene 中有 3 个过滤器:

DateFilter constrains the document space to only documents with a specified date field within a given range of dates.

 

QueryFilter uses the results of query as the searchable document space for a new query.

 

CachingWrapperFilter is a decorator over another filter caching its results to increase performance when used again.

 

Filter的使用是很简单的 看个例子.

package lia.advsearching;

import lia.common.LiaTestCase;
import org.apache.lucene.document.DateField;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.CachingWrapperFilter;
import org.apache.lucene.search.DateFilter;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryFilter;
import org.apache.lucene.search.RangeQuery;
import org.apache.lucene.search.TermQuery;

import java.util.Date;

/**
 * Demonstrates Lucene's built-in filters (DateFilter, QueryFilter,
 * CachingWrapperFilter) by narrowing an "all books" query.
 *
 * <p>The index directory and the {@code parseDate} helper come from
 * {@link LiaTestCase}.
 */
public class FilterTest extends LiaTestCase {
  private Query allBooks;
  private IndexSearcher searcher;
  private int numAllBooks;
  private CachingWrapperFilter cachingFilter;

  /**
   * Builds the "all books" range query over the pubmonth field and records
   * how many documents it matches without any filter.
   */
  protected void setUp() throws Exception {
    super.setUp();

    allBooks = new RangeQuery(new Term("pubmonth", "190001"),
                                   new Term("pubmonth", "200512"),
                                   true);
    searcher = new IndexSearcher(directory);
    Hits hits = searcher.search(allBooks);
    numAllBooks = hits.length();
  }

  /** Closes the searcher opened in {@link #setUp()} before the base-class cleanup. */
  protected void tearDown() throws Exception {
    try {
      if (searcher != null) {
        searcher.close();
      }
    } finally {
      super.tearDown();
    }
  }

  /** DateFilter restricts hits to documents whose date field is within a range. */
  public void testDateFilter() throws Exception {
    Date jan1 = parseDate("2004-01-01");
    Date jan31 = parseDate("2004-01-31");
    Date dec31 = parseDate("2004-12-31");

    // Create a DateFilter covering all of 2004.
    DateFilter filter = new DateFilter("modified", jan1, dec31);

    // Search with the filter applied.
    Hits hits = searcher.search(allBooks, filter);
    assertEquals("all modified in 2004",
        numAllBooks, hits.length());

    // Narrow the filter to January only and search again.
    filter = new DateFilter("modified", jan1, jan31);
    hits = searcher.search(allBooks, filter);
    assertEquals("none modified in January",
        0, hits.length());
  }

  /** QueryFilter uses the results of one query to constrain another. */
  public void testQueryFilter() throws Exception {
    TermQuery categoryQuery =
       new TermQuery(new Term("category", "/philosophy/eastern"));

    // Wrap the category query in a QueryFilter.
    Filter categoryFilter = new QueryFilter(categoryQuery);

    Hits hits = searcher.search(allBooks, categoryFilter);
    assertEquals("only tao te ching", 1, hits.length());
  }

  /** Gets the same result as the QueryFilter test using a BooleanQuery instead. */
  public void testFilterAlternative() throws Exception {
    TermQuery categoryQuery =
       new TermQuery(new Term("category", "/philosophy/eastern"));

    // Both clauses are added as required and not prohibited.
    BooleanQuery constrainedQuery = new BooleanQuery();
    constrainedQuery.add(allBooks, true, false);
    constrainedQuery.add(categoryQuery, true, false);

    Hits hits = searcher.search(constrainedQuery);
    assertEquals("only tao te ching", 1, hits.length());
  }


  /** A QueryFilter wrapping a RangeQuery on the modified field. */
  public void testQueryFilterWithRangeQuery() throws Exception {
    Date jan1 = parseDate("2004-01-01");
    Date dec31 = parseDate("2004-12-31");

    Term start = new Term("modified",
        DateField.dateToString(jan1));
    Term end = new Term("modified",
        DateField.dateToString(dec31));

    Query rangeQuery = new RangeQuery(start, end, true);

    Filter filter = new QueryFilter(rangeQuery);
    Hits hits = searcher.search(allBooks, filter);    // filtered search
    assertEquals("all of 'em", numAllBooks, hits.length());
  }

  /**
   * CachingWrapperFilter caches the wrapped filter's results, which helps
   * performance when the same filter is reused across searches.
   */
  public void testCachingWrapper() throws Exception {
    Date jan1 = parseDate("2004-01-01");
    Date dec31 = parseDate("2004-12-31");

    DateFilter dateFilter =
        new DateFilter("modified", jan1, dec31);

    // Create the CachingWrapperFilter around the date filter.
    cachingFilter =
        new CachingWrapperFilter(dateFilter);
    Hits hits = searcher.search(allBooks, cachingFilter);
    assertEquals("all of 'em", numAllBooks, hits.length());
  }
}

DateFilter 还可以只限定日期范围的一端, 如下:

filter = DateFilter.Before("modified", endDate);

 

filter = DateFilter.After("modified", startDate);

 

可以参考Filter 的源码来写自己的Filter 还可以使用Lucene Sandbox中的Filter.

VI.在多个index中搜索.

这在应用中可能是经常使用的.也是很容易使用的只要利用MultiSearcher 来代替IndexSearcher就可以了.如下所示

searchers = new IndexSearcher[2];

searchers[0] = new IndexSearcher(aTOmDirectory);

searchers[1] = new IndexSearcher(nTOzDirectory);

 

MultiSearcher searcher = new MultiSearcher(searchers);

 

完整测试代码点击此处.

VII.Leveraging term vectors

 

这也是1.4中新加的东东. 好像不怎么用到 有兴趣时再看吧.



来源:Java爱好者 -- J2EE文章精选
录入:admin
阅读:0
日期:2006-8-30 10:25:58

评论(0篇) 】 【 打印 】 【 字体: