Skip to content

Commit f701dd9

Browse files
committed
add must-not search to indexer.go and add test cases.
1 parent e1b46af commit f701dd9

2 files changed

Lines changed: 195 additions & 13 deletions

File tree

core/indexer.go

Lines changed: 89 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -141,15 +141,15 @@ func (indexer *Indexer) Lookup(
141141
}
142142
numDocs = 0
143143

144-
// 合并关键词和标签为搜索键
145-
keywords := make([]string, len(tokens)+len(labels))
146-
copy(keywords, tokens)
147-
copy(keywords[len(tokens):], labels)
144+
mustKeywords, mustTokensLength, mustNotKeywords, isValid := getProssedQueries(tokens, labels)
145+
if !isValid {
146+
return
147+
}
148148

149149
indexer.tableLock.RLock()
150150
defer indexer.tableLock.RUnlock()
151-
table := make([]*KeywordIndices, len(keywords))
152-
for i, keyword := range keywords {
151+
table := make([]*KeywordIndices, len(mustKeywords))
152+
for i, keyword := range mustKeywords {
153153
indices, found := indexer.tableLock.table[keyword]
154154
if !found {
155155
// 当反向索引表中无此搜索键时直接返回
@@ -160,6 +160,15 @@ func (indexer *Indexer) Lookup(
160160
}
161161
}
162162

163+
// 保存must not搜索键
164+
mustNotTable := make([]*KeywordIndices, 0)
165+
for _, keyword := range mustNotKeywords {
166+
indices, found := indexer.tableLock.table[keyword]
167+
if found {
168+
mustNotTable = append(mustNotTable, indices)
169+
}
170+
}
171+
163172
// 当没有找到时直接返回
164173
if len(table) == 0 {
165174
return
@@ -171,6 +180,7 @@ func (indexer *Indexer) Lookup(
171180
for iTable := 0; iTable < len(table); iTable++ {
172181
indexPointers[iTable] = indexer.getIndexLength(table[iTable]) - 1
173182
}
183+
174184
// 平均文本关键词长度,用于计算BM25
175185
avgDocLength := indexer.totalTokenLength / float32(indexer.numDocuments)
176186
for ; indexPointers[0] >= 0; indexPointers[0]-- {
@@ -186,7 +196,9 @@ func (indexer *Indexer) Lookup(
186196
continue
187197
}
188198
}
199+
189200
iTable := 1
201+
190202
found := true
191203
for ; iTable < len(table); iTable++ {
192204
// 二分法比简单的顺序归并效率高,也有更高效率的算法,
@@ -196,7 +208,11 @@ func (indexer *Indexer) Lookup(
196208
position, foundBaseDocId := indexer.searchIndex(table[iTable],
197209
0, indexPointers[iTable], baseDocId)
198210
if foundBaseDocId {
199-
indexPointers[iTable] = position
211+
if !indexer.findInMustNotTable(mustNotTable, baseDocId) {
212+
indexPointers[iTable] = position
213+
} else {
214+
found = false
215+
}
200216
} else {
201217
if position == 0 {
202218
// 该搜索键中所有的文档ID都比baseDocId大,因此已经没有
@@ -211,19 +227,27 @@ func (indexer *Indexer) Lookup(
211227
}
212228
}
213229

230+
// 如果搜索键只返回一个反向表, 并且存在逻辑非搜索键
231+
// 则需要判断baseDocId是不是在逻辑非反向表中
232+
if len(table) == 1 && len(mustNotTable) > 0 {
233+
if indexer.findInMustNotTable(mustNotTable, baseDocId) {
234+
found = false
235+
}
236+
}
237+
214238
if found {
215239
indexedDoc := types.IndexedDocument{}
216240

217241
// 当为LocationsIndex时计算关键词紧邻距离
218242
if indexer.initOptions.IndexType == types.LocationsIndex {
219243
// 计算有多少关键词是带有距离信息的
220244
numTokensWithLocations := 0
221-
for i, t := range table[:len(tokens)] {
245+
for i, t := range table[:mustTokensLength] {
222246
if len(t.locations[indexPointers[i]]) > 0 {
223247
numTokensWithLocations++
224248
}
225249
}
226-
if numTokensWithLocations != len(tokens) {
250+
if numTokensWithLocations != mustTokensLength {
227251
if !countDocsOnly {
228252
docs = append(docs, types.IndexedDocument{
229253
DocId: baseDocId,
@@ -234,13 +258,13 @@ func (indexer *Indexer) Lookup(
234258
}
235259

236260
// 计算搜索键在文档中的紧邻距离
237-
tokenProximity, tokenLocations := computeTokenProximity(table[:len(tokens)], indexPointers, tokens)
261+
tokenProximity, tokenLocations := computeTokenProximity(table[:mustTokensLength], indexPointers, mustKeywords[:mustTokensLength])
238262
indexedDoc.TokenProximity = int32(tokenProximity)
239263
indexedDoc.TokenSnippetLocations = tokenLocations
240264

241265
// 添加TokenLocations
242-
indexedDoc.TokenLocations = make([][]int, len(tokens))
243-
for i, t := range table[:len(tokens)] {
266+
indexedDoc.TokenLocations = make([][]int, mustTokensLength)
267+
for i, t := range table[:mustTokensLength] {
244268
indexedDoc.TokenLocations[i] = t.locations[indexPointers[i]]
245269
}
246270
}
@@ -250,7 +274,7 @@ func (indexer *Indexer) Lookup(
250274
indexer.initOptions.IndexType == types.FrequenciesIndex {
251275
bm25 := float32(0)
252276
d := indexer.docTokenLengths[baseDocId]
253-
for i, t := range table[:len(tokens)] {
277+
for i, t := range table[:mustTokensLength] {
254278
var frequency float32
255279
if indexer.initOptions.IndexType == types.LocationsIndex {
256280
frequency = float32(len(t.locations[indexPointers[i]]))
@@ -423,3 +447,55 @@ func (indexer *Indexer) RemoveDoc(docId uint64) {
423447
indexer.numDocuments--
424448
indexer.tableLock.Unlock()
425449
}
450+
451+
// getProssedQueries splits query tokens and labels into "must" and
// "must not" keyword lists based on a one-character prefix:
//
//   - "+keyword" or a bare "keyword" must appear in a matching document;
//   - "-keyword" must NOT appear in a matching document.
//
// It returns (mustKeywords, mustTokensLength, mustNotKeywords, ok) where
// mustKeywords holds the tokens first and the labels after them, so
// mustKeywords[:mustTokensLength] is exactly the token-derived portion
// (labels never increment mustTokensLength). ok is false when the query
// contains only must-not tokens, which is invalid — there is nothing
// positive to intersect against.
//
// NOTE(review): the name looks like a typo for getProcessedQueries; it is
// kept unchanged because Lookup calls it under this name.
// NOTE(review): the validity check uses mustTokensLength, so a query whose
// only positive keywords come from labels is still rejected when any
// must-not token is present — confirm this is intended.
func getProssedQueries(tokens []string, labels []string) (
	[]string, int, []string, bool) {
	mustTokensLength := 0
	mustKeywords := make([]string, 0, len(tokens)+len(labels))
	mustNotKeywords := make([]string, 0)

	for _, v := range tokens {
		if len(v) == 0 {
			continue
		}
		// Comparing the first byte is equivalent to the original
		// v[0:1] == "+" string comparison: '+' and '-' are single-byte
		// ASCII, so a multi-byte rune can never match either case.
		switch v[0] {
		case '+':
			mustKeywords = append(mustKeywords, v[1:])
			mustTokensLength++
		case '-':
			mustNotKeywords = append(mustNotKeywords, v[1:])
		default:
			mustKeywords = append(mustKeywords, v)
			mustTokensLength++
		}
	}

	for _, v := range labels {
		if len(v) == 0 {
			continue
		}
		switch v[0] {
		case '+':
			mustKeywords = append(mustKeywords, v[1:])
		case '-':
			mustNotKeywords = append(mustNotKeywords, v[1:])
		default:
			mustKeywords = append(mustKeywords, v)
		}
	}

	// A query made up solely of must-not keywords is invalid: with no
	// positive token there is no candidate set to subtract from.
	if mustTokensLength == 0 && len(mustNotKeywords) > 0 {
		return mustKeywords, mustTokensLength, mustNotKeywords, false
	}
	return mustKeywords, mustTokensLength, mustNotKeywords, true
}
489+
490+
// 在must not table中查找docId
491+
// 返回: 找到: true, 未找到: false
492+
func (indexer *Indexer) findInMustNotTable(table []*KeywordIndices, docId uint64) bool {
493+
for i := 0; i < len(table); i++ {
494+
_, foundDocId := indexer.searchIndex(table[i],
495+
0, indexer.getIndexLength(table[i])-1, docId)
496+
if foundDocId {
497+
return true
498+
}
499+
}
500+
return false
501+
}

core/indexer_test.go

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -370,3 +370,109 @@ func TestLookupWithLocations(t *testing.T) {
370370
docs, _ := indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)
371371
utils.Expect(t, "[[0 21] [28]]", docs[0].TokenLocations)
372372
}
373+
374+
// TestLookupWithMustNot checks must-not ("-") exclusion against a single
// document: every query below names a keyword the document contains with a
// "-" prefix, so each lookup must return zero documents.
func TestLookupWithMustNot(t *testing.T) {
	var indexer Indexer
	indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
	// doc0 = "token2 token4 token4 token2 token3 token4"
	indexer.AddDocument(&types.DocumentIndex{
		DocId: 0,
		Keywords: []types.KeywordIndex{
			{"token2", 0, []int{0, 21}},
			{"token3", 0, []int{28}},
			{"token4", 0, []int{7, 14, 35}},
		},
	})

	// doc0 contains token3, so "-token3" excludes it.
	docs, num := indexer.Lookup([]string{"+token2", "-token3"}, []string{}, nil, false)
	utils.Expect(t, "0", num)
	if len(docs) == 0 {
		t.Log("Correct!, 0 docs returned.")
	}

	// Same exclusion with an additional plain must token.
	_, num = indexer.Lookup([]string{"+token2", "token4", "-token3"}, []string{}, nil, false)
	utils.Expect(t, "0", num)

	// Two must-not tokens, both present in doc0.
	_, num = indexer.Lookup([]string{"+token2", "-token4", "-token3"}, []string{}, nil, false)
	utils.Expect(t, "0", num)
}
399+
400+
// TestLookupWithMustNotMulti checks must ("+"), plain, and must-not ("-")
// token combinations across several documents in a DocIdsIndex, including
// the invalid all-must-not query, which must return no documents.
func TestLookupWithMustNotMulti(t *testing.T) {
	var indexer Indexer
	indexer.Init(types.IndexerInitOptions{IndexType: types.DocIdsIndex})
	// doc0 = "token2 token3"
	indexer.AddDocument(&types.DocumentIndex{
		DocId: 0,
		Keywords: []types.KeywordIndex{
			{"token2", 0, []int{0}},
			{"token3", 0, []int{7}},
		},
	})
	// doc1 = "token1 token2 token3"
	indexer.AddDocument(&types.DocumentIndex{
		DocId: 1,
		Keywords: []types.KeywordIndex{
			{"token1", 0, []int{0}},
			{"token2", 0, []int{7}},
			{"token3", 0, []int{14}},
		},
	})
	// doc2 = "token1 token2"
	indexer.AddDocument(&types.DocumentIndex{
		DocId: 2,
		Keywords: []types.KeywordIndex{
			{"token1", 0, []int{0}},
			{"token2", 0, []int{7}},
		},
	})
	// doc3 = "token2"
	indexer.AddDocument(&types.DocumentIndex{
		DocId: 3,
		Keywords: []types.KeywordIndex{
			{"token2", 0, []int{0}},
		},
	})
	// doc7 = "token1 token3"
	indexer.AddDocument(&types.DocumentIndex{
		DocId: 7,
		Keywords: []types.KeywordIndex{
			{"token1", 0, []int{0}},
			{"token3", 0, []int{7}},
		},
	})
	// doc9 = "token3"
	indexer.AddDocument(&types.DocumentIndex{
		DocId: 9,
		Keywords: []types.KeywordIndex{
			{"token3", 0, []int{0}},
		},
	})

	// Sanity-check the inverted lists before exercising Lookup.
	utils.Expect(t, "1 2 7 ", indicesToString(&indexer, "token1"))
	utils.Expect(t, "0 1 2 3 ", indicesToString(&indexer, "token2"))
	utils.Expect(t, "0 1 7 9 ", indicesToString(&indexer, "token3"))

	// token4 is not indexed: a must query on it matches nothing.
	utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"+token4"}, []string{}, nil, false)))

	// Plain token behaves as a must token.
	utils.Expect(t, "[7 0 []] [2 0 []] [1 0 []] ",
		indexedDocsToString(indexer.Lookup([]string{"token1"}, []string{}, nil, false)))
	utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token1", "+token4"}, []string{}, nil, false)))

	// Mixed "+"/plain and all-"+" queries intersect identically.
	utils.Expect(t, "[2 0 []] [1 0 []] ",
		indexedDocsToString(indexer.Lookup([]string{"+token1", "token2"}, []string{}, nil, false)))

	utils.Expect(t, "[2 0 []] [1 0 []] ",
		indexedDocsToString(indexer.Lookup([]string{"+token2", "+token1"}, []string{}, nil, false)))

	// "-token2" drops docs 1 and 2, leaving only doc7.
	utils.Expect(t, "[7 0 []] ",
		indexedDocsToString(indexer.Lookup([]string{"token1", "-token2"}, []string{}, nil, false)))

	// "-token3" drops docs 0 and 1 from token2's list.
	utils.Expect(t, "[3 0 []] [2 0 []] ",
		indexedDocsToString(indexer.Lookup([]string{"token2", "-token3"}, []string{}, nil, false)))

	// Adding "-token1" also drops doc2.
	utils.Expect(t, "[3 0 []] ",
		indexedDocsToString(indexer.Lookup([]string{"token2", "-token3", "-token1"}, []string{}, nil, false)))

	// A query with only must-not tokens is invalid and returns nothing.
	utils.Expect(t, "",
		indexedDocsToString(indexer.Lookup([]string{"-token2", "-token3", "-token1"}, []string{}, nil, false)))
}

0 commit comments

Comments
 (0)