IDZEBRA  2.2.7
ranksimilarity.c
Go to the documentation of this file.
1 /* This file is part of the Zebra server.
2  Copyright (C) Index Data
3 
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8 
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 for more details.
13 
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 
18 */
19 
20 #if HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23 #include <stdio.h>
24 #include <assert.h>
25 #include <limits.h>
26 #ifdef WIN32
27 #include <io.h>
28 #endif
29 #if HAVE_UNISTD_H
30 #include <unistd.h>
31 #endif
32 
33 #include "index.h"
34 #include "rank.h"
35 
36 static int log_level = 0;
37 static int log_initialized = 0;
38 
/*
 * Per-class state of the rank-similarity handler. An instance is
 * allocated in create() and released in destroy(). It carries no
 * real data; the single member only keeps the struct non-empty.
 */
struct ranksimilarity_class_info {
    int dummy;
};
42 
45 
48 
51 
54 
57 
60 
62  int rank_flag;
63 
66 
69 
72 };
73 
75  int last_pos;
76 
79 
82 
85 
88 
91 
92  NMEM nmem;
93 };
94 
95 
96 /* local clean-up function */
98 {
99  int i;
100 
101  for (i = 0; i < si->no_terms_query; i++)
102  {
103  si->entries[i].freq_term_docfield = 0;
104  }
105 }
106 
107 
108 /*
109  * create: Creates/Initialises this rank handler. This routine is
110  * called exactly once. The routine returns the class_handle.
111  */
112 static void *create (ZebraHandle zh)
113 {
114  struct ranksimilarity_class_info *ci =
115  (struct ranksimilarity_class_info *) xmalloc (sizeof(*ci));
116 
117  if (!log_initialized)
118  {
119  log_level = yaz_log_module_level("rank-similarity");
120  log_initialized = 1;
121  }
122  yaz_log(log_level, "create()");
123  return 0;
124 }
125 
126 /*
127  * destroy: Destroys this rank handler. This routine is called
128  * when the handler is no longer needed - i.e. when the server
129  * dies. The class_handle was previously returned by create.
130  */
131 static void destroy (struct zebra_register *reg, void *class_handle)
132 {
133  struct ranksimilarity_class_info *ci
134  = (struct ranksimilarity_class_info *) class_handle;
135  yaz_log(log_level, "destroy()");
136  xfree (ci);
137 }
138 
139 
145 static void *begin (struct zebra_register *reg,
146  void *class_handle, RSET rset, NMEM nmem,
147  TERMID *terms, int numterms)
148 {
149  struct ranksimilarity_set_info *si =
150  (struct ranksimilarity_set_info *) nmem_malloc (nmem, sizeof(*si));
151  int i;
152 
153  yaz_log(log_level, "begin() numterms=%d", numterms);
154 
155  /* setting database global statistics */
156  si->no_docs_database = -1; /* TODO */
157  si->no_terms_database = -1; /* TODO */
158 
159  /* setting query statistics */
160  si->no_terms_query = numterms;
161  si->no_ranked_terms_query = 0;
162 
163  /* setting internal data structures */
164  si->nmem=nmem;
165  si->entries = (struct ranksimilarity_term_info *)
166  nmem_malloc (si->nmem, sizeof(*si->entries)*numterms);
167 
168  /* reset the counts for the next term */
170 
171 
172  /* looping all terms in a specific fieldindex of query */
173  for (i = 0; i < numterms; i++)
174  {
175  struct ord_list *ol = NULL;
176 
177 
178  /* adding to number of rank entries */
179  if (strncmp (terms[i]->flags, "rank,", 5))
180  {
181  si->entries[i].rank_flag = 0;
182  yaz_log(log_level, "begin() terms[%d]: '%s' flags=%s not ranked",
183  i, terms[i]->name, terms[i]->flags);
184  }
185  else
186  {
187  const char *cp = strstr(terms[i]->flags+4, ",w=");
188 
189  zint no_docs_fieldindex = 0;
190  zint no_terms_fieldindex = 0;
191 
192  yaz_log(log_level, "begin() terms[%d]: '%s' flags=%s",
193  i, terms[i]->name, terms[i]->flags);
194 
195  (si->no_ranked_terms_query)++;
196  ol = terms[i]->ol;
197 
198  si->entries[i].rank_flag = 1;
199  /* notice that the call to rset_count(rset) has he side-effect
200  of setting rset->hits_limit = rset_count(rset) ??? */
201  si->entries[i].freq_term_resset = rset_count(terms[i]->rset);
202  si->entries[i].no_docs_resset = terms[i]->rset->hits_count;
203 
204 
205  if (cp)
206  si->entries[i].fieldindex_weight = atoi (cp+3);
207  else
208  si->entries[i].fieldindex_weight = 34; /* sqrroot of 1000 */
209 
210 
211  /*
212  yaz_log(log_level, "begin() rset_count(terms[%d]->rset) = "
213  ZINT_FORMAT, i, rset_count(terms[i]->rset));
214  yaz_log(log_level, "begin() terms[%d]->rset->hits_limit = "
215  ZINT_FORMAT, i, terms[i]->rset->hits_limit);
216  yaz_log(log_level, "begin() terms[%d]->rset->hits_count = "
217  ZINT_FORMAT, i, terms[i]->rset->hits_count);
218  yaz_log(log_level, "begin() terms[%d]->rset->hits_round = "
219  ZINT_FORMAT, i, terms[i]->rset->hits_round);
220  yaz_log(log_level, "begin() terms[%d]->rset->hits_approx = %d",
221  i, terms[i]->rset->hits_approx);
222  */
223 
224  /* looping indexes where term terms[i] is found */
225 
226  for (; ol; ol = ol->next)
227  {
228  const char *index_type = 0;
229  const char *db = 0;
230  const char *string_index = 0;
231 
233  ol->ord, &index_type, &db,
234  &string_index);
235 
236  no_docs_fieldindex
238  no_terms_fieldindex
240 
241  if (string_index)
242  yaz_log(log_level,
243  "begin() index: ord=%d type=%s db=%s str-index=%s",
244  ol->ord, index_type, db, string_index);
245  else
246  yaz_log(log_level,
247  "begin() index: ord=%d type=%s db=%s",
248  ol->ord, index_type, db);
249  }
250 
251  si->entries[i].no_docs_fieldindex = no_docs_fieldindex;
252  si->entries[i].no_terms_fieldindex = no_terms_fieldindex;
253  }
254 
255  si->entries[i].term = terms[i];
256  si->entries[i].term_index=i;
257 
258  /* setting next entry in term */
259  terms[i]->rankpriv = &(si->entries[i]);
260  }
261 
262  return si;
263 }
264 
265 /*
266  * end: Terminates ranking process. Called after a result set
267  * has been ranked.
268  */
269 static void end (struct zebra_register *reg, void *set_handle)
270 {
271  yaz_log(log_level, "end()");
272 }
273 
274 
275 
281 static void add (void *set_handle, int seqno, TERMID term)
282 {
283  struct ranksimilarity_set_info *si
284  = (struct ranksimilarity_set_info *) set_handle;
285  struct ranksimilarity_term_info *ti;
286  assert(si);
287  if (!term)
288  {
289  /* yaz_log(log_level, "add() seqno=%d NULL term", seqno); */
290  return;
291  }
292 
293  ti= (struct ranksimilarity_term_info *) term->rankpriv;
294  assert(ti);
295  si->last_pos = seqno;
296  ti->freq_term_docfield++;
297  /*yaz_log(log_level, "add() seqno=%d term=%s freq_term_docfield=%d",
298  seqno, term->name, ti->freq_term_docfield); */
299 }
300 
301 /*
302  * calc: Called for each document in a result. This handler should
303  * produce a score based on previous call(s) to the add handler. The
304  * score should be between 0 and 1000. If score cannot be obtained
305  * -1 should be returned.
306  */
307 static int calc (void *set_handle, zint sysno, zint staticrank,
308  int *stop_flag)
309 {
310  int i, score = 0;
311  struct ranksimilarity_set_info *si
312  = (struct ranksimilarity_set_info *) set_handle;
313 
314 
315  yaz_log(log_level, "calc() sysno = " ZINT_FORMAT, sysno);
316  yaz_log(log_level, "calc() staticrank = " ZINT_FORMAT, staticrank);
317 
318  yaz_log(log_level, "calc() si->no_terms_query = %d",
319  si->no_terms_query);
320  yaz_log(log_level, "calc() si->no_ranked_terms_query = %d",
322  yaz_log(log_level, "calc() si->no_docs_database = " ZINT_FORMAT,
323  si->no_docs_database);
324  yaz_log(log_level, "calc() si->no_terms_database = " ZINT_FORMAT,
325  si->no_terms_database);
326 
327 
328  if (!si->no_ranked_terms_query)
329  return -1; /* ranking not enabled for any terms */
330 
331 
332  /* if we set *stop_flag = 1, we stop processing (of result set list) */
333 
334 
335  /* here goes your formula to compute a scoring function */
336  /* you may use all the gathered statistics here */
337  for (i = 0; i < si->no_terms_query; i++)
338  {
339  yaz_log(log_level, "calc() entries[%d] termid %p",
340  i, si->entries[i].term);
341  if (si->entries[i].term){
342  yaz_log(log_level, "calc() entries[%d] term '%s' flags=%s",
343  i, si->entries[i].term->name, si->entries[i].term->flags);
344  yaz_log(log_level, "calc() entries[%d] rank_flag %d",
345  i, si->entries[i].rank_flag );
346  yaz_log(log_level, "calc() entries[%d] fieldindex_weight %d",
347  i, si->entries[i].fieldindex_weight );
348  yaz_log(log_level, "calc() entries[%d] freq_term_docfield %d",
349  i, si->entries[i].freq_term_docfield );
350  yaz_log(log_level, "calc() entries[%d] freq_term_resset " ZINT_FORMAT,
351  i, si->entries[i].freq_term_resset );
352  yaz_log(log_level, "calc() entries[%d] no_docs_resset " ZINT_FORMAT,
353  i, si->entries[i].no_docs_resset );
354  yaz_log(log_level, "calc() entries[%d] no_docs_fieldindex "
355  ZINT_FORMAT,
356  i, si->entries[i].no_docs_fieldindex );
357  yaz_log(log_level, "calc() entries[%d] no_terms_fieldindex "
358  ZINT_FORMAT,
359  i, si->entries[i].no_terms_fieldindex );
360  }
361  }
362 
363 
364  /* reset the counts for the next term */
366 
367 
368  /* staticrank = 0 is highest, MAXINT lowest */
369  if (staticrank >= INT_MAX)
370  score = 0;
371  else
372  { /* but score is reverse (logical) */
373  score = INT_MAX - CAST_ZINT_TO_INT(staticrank);
374  }
375 
376 
377  /* debugging statistics output */
378  yaz_log(log_level, "calc() statistics: score = %d", score);
379 
380  return score;
381 }
382 
383 /*
384  * Pseudo-meta code with sequence of calls as they occur in a
385  * server. Handlers are prefixed by --:
386  *
387  * server init
388  * -- create
389  * foreach search
390  * rank result set
391  * -- begin
392  * foreach record
393  * foreach word
394  * -- add
395  * -- calc
396  * -- end
397  * -- destroy
398  * server close
399  */
400 
401 static struct rank_control rank_control = {
402  "rank-similarity",
403  create,
404  destroy,
405  begin,
406  end,
407  calc,
408  add,
409 };
410 
412 /*
413  * Local variables:
414  * c-basic-offset: 4
415  * c-file-style: "Stroustrup"
416  * indent-tabs-mode: nil
417  * End:
418  * vim: shiftwidth=4 tabstop=8 expandtab
419  */
420 
static void ranksimilar_rec_reset(struct ranksimilarity_set_info *si)
static void * begin(struct zebra_register *reg, void *class_handle, RSET rset, NMEM nmem, TERMID *terms, int numterms)
static struct rank_control rank_control
static void end(struct zebra_register *reg, void *set_handle)
static void * create(ZebraHandle zh)
static int log_initialized
struct rank_control * rank_similarity_class
static void destroy(struct zebra_register *reg, void *class_handle)
static int log_level
static int calc(void *set_handle, zint sysno, zint staticrank, int *stop_flag)
static void add(void *set_handle, int seqno, TERMID term)
zint rset_count(RSET rs)
Estimates hit count for result set.
Definition: rset.c:272
Definition: rset.h:35
int ord
Definition: rset.h:36
struct ord_list * next
Definition: rset.h:37
struct ranksimilarity_term_info * entries
Definition: rset.h:50
char * flags
Definition: rset.h:52
struct ord_list * ol
Definition: rset.h:64
char * name
Definition: rset.h:51
RSET rset
Definition: rset.h:60
void * rankpriv
Definition: rset.h:61
Definition: rset.h:151
zint hits_count
Definition: rset.h:164
ZebraExplainInfo zei
Definition: index.h:139
long zint
Zebra integer.
Definition: util.h:66
#define ZINT_FORMAT
Definition: util.h:72
#define CAST_ZINT_TO_INT(x)
Definition: util.h:96
zint zebraExplain_ord_get_term_occurrences(ZebraExplainInfo zei, int ord)
Definition: zinfo.c:1470
int zebraExplain_lookup_ord(ZebraExplainInfo zei, int ord, const char **index_type, const char **db, const char **string_index)
Definition: zinfo.c:1478
zint zebraExplain_ord_get_doc_occurrences(ZebraExplainInfo zei, int ord)
Definition: zinfo.c:1462