pazpar2  1.13.0
termlists.c
Go to the documentation of this file.
1 /* This file is part of Pazpar2.
2  Copyright (C) Index Data
3 
4 Pazpar2 is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8 
9 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 for more details.
13 
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 
18 */
19 
20 #if HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23 
24 #include <assert.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <yaz/yaz-util.h>
28 
29 #include "termlists.h"
30 #include "jenkins_hash.h"
31 
32 // Discussion:
33 // As terms are found in incoming records, they are added to (or updated in) a
34 // Hash table. When term records are updated, a frequency value is updated. At
35 // the same time, a highscore is maintained for the most frequent terms.
36 
38 {
41 };
42 
43 struct termlist
44 {
46  unsigned hash_size;
47 
49  NMEM nmem;
50 };
51 
53 {
54  struct termlist *res = nmem_malloc(nmem, sizeof(struct termlist));
55  res->hash_size = 399;
56  res->hashtable =
57  nmem_malloc(nmem, res->hash_size * sizeof(struct termlist_bucket*));
58  memset(res->hashtable, 0, res->hash_size * sizeof(struct termlist_bucket*));
59  res->nmem = nmem;
60  res->no_entries = 0;
61  return res;
62 }
63 
64 void termlist_insert(struct termlist *tl, const char *display_term,
65  const char *norm_term, const char *id, size_t id_len,
66  int freq)
67 {
68  unsigned int bucket;
69  struct termlist_bucket **p;
70  char buf[256];
71 
72  if (strlen(norm_term) > 255)
73  return;
74  strcpy(buf, norm_term);
75  bucket = jenkins_hash((unsigned char *)buf) % tl->hash_size;
76  for (p = &tl->hashtable[bucket]; *p; p = &(*p)->next)
77  {
78  if (!strcmp(buf, (*p)->term.norm_term))
79  {
80  (*p)->term.frequency += freq;
81  break;
82  }
83  }
84  if (!*p) // We made it to the end of the bucket without finding match
85  {
86  struct termlist_bucket *new = nmem_malloc(tl->nmem,
87  sizeof(struct termlist_bucket));
88  new->term.norm_term = nmem_strdup(tl->nmem, buf);
89  new->term.display_term = *display_term ?
90  nmem_strdup(tl->nmem, display_term) : new->term.norm_term;
91  new->term.id = id ? nmem_strdupn(tl->nmem, id, id_len) : 0;
92  new->term.frequency = freq;
93  new->next = 0;
94  *p = new;
95  tl->no_entries++;
96  }
97 }
98 
99 static int compare(const void *s1, const void *s2)
100 {
101  struct termlist_score **p1 = (struct termlist_score **) s1;
102  struct termlist_score **p2 = (struct termlist_score **) s2;
103  int d = (*p2)->frequency - (*p1)->frequency;
104  if (d)
105  return d;
106  return strcmp((*p1)->display_term, (*p2)->display_term);
107 }
108 
109 struct termlist_score **termlist_highscore(struct termlist *tl, int *len,
110  NMEM nmem)
111 {
112  struct termlist_score **highscore =
113  (struct termlist_score **)
114  nmem_malloc(nmem, tl->no_entries * sizeof(*highscore));
115 
116  int no = 0;
117  unsigned bucket;
118  for (bucket = 0; bucket < tl->hash_size; bucket++)
119  {
120  struct termlist_bucket *p;
121  for (p = tl->hashtable[bucket]; p; p = p->next)
122  highscore[no++] = &p->term;
123  }
124  assert(no == tl->no_entries);
125  qsort(highscore, tl->no_entries, sizeof(struct termlist_score*), compare);
126  *len = tl->no_entries;
127  return highscore;
128 }
129 
130 /*
131  * Local variables:
132  * c-basic-offset: 4
133  * c-file-style: "Stroustrup"
134  * indent-tabs-mode: nil
135  * End:
136  * vim: shiftwidth=4 tabstop=8 expandtab
137  */
138 
static int compare(const void *s1, const void *s2)
Definition: termlists.c:99
struct termlist * termlist_create(NMEM nmem)
Definition: termlists.c:52
void termlist_insert(struct termlist *tl, const char *display_term, const char *norm_term, const char *id, size_t id_len, int freq)
Definition: termlists.c:64
unsigned hash_size
Definition: termlists.c:46
struct termlist_score term
Definition: termlists.c:39
NMEM nmem
Definition: termlists.c:49
struct termlist_bucket * next
Definition: termlists.c:40
struct termlist_score ** termlist_highscore(struct termlist *tl, int *len, NMEM nmem)
Definition: termlists.c:109
unsigned int jenkins_hash(const unsigned char *key)
Definition: jenkins_hash.c:31
int no_entries
Definition: termlists.c:48
struct termlist_bucket ** hashtable
Definition: termlists.c:45