pazpar2  1.13.0
charsets.c
Go to the documentation of this file.
1 /* This file is part of Pazpar2.
2  Copyright (C) Index Data
3 
4 Pazpar2 is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8 
9 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 for more details.
13 
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 
18 */
19 
24 #if HAVE_CONFIG_H
25 #include <config.h>
26 #endif
27 
28 #include <yaz/xmalloc.h>
29 #include <yaz/wrbuf.h>
30 #include <yaz/log.h>
31 #include <yaz/yaz-version.h>
32 #include <yaz/xml_get.h>
33 #include <ctype.h>
34 #include <assert.h>
35 #include <string.h>
36 
37 #include "charsets.h"
38 #include "normalize7bit.h"
39 
40 typedef struct pp2_charset_s *pp2_charset_t;
41 static pp2_charset_t pp2_charset_create_xml(xmlNode *xml_node);
42 static pp2_charset_t pp2_charset_create(void);
43 static pp2_charset_t pp2_charset_create_a_to_z(void);
44 static void pp2_charset_destroy(pp2_charset_t pct);
45 static pp2_charset_token_t pp2_charset_tokenize(pp2_charset_t pct);
46 
47 #if YAZ_HAVE_ICU
48 #include <yaz/icu.h>
49 static pp2_charset_t pp2_charset_create_icu(struct icu_chain *icu_chn);
50 #endif
51 
52 /* charset handle */
53 struct pp2_charset_s {
54  const char *(*token_next_handler)(pp2_charset_token_t prt);
55  const char *(*get_sort_handler)(pp2_charset_token_t prt);
56  const char *(*get_display_handler)(pp2_charset_token_t prt);
58  size_t *start, size_t *len);
59 #if YAZ_HAVE_ICU
60  struct icu_chain * icu_chn;
61  UErrorCode icu_sts;
62 #endif
63 };
64 
65 static const char *pp2_charset_token_null(pp2_charset_token_t prt);
66 static const char *pp2_charset_token_a_to_z(pp2_charset_token_t prt);
67 static const char *pp2_get_sort_ascii(pp2_charset_token_t prt);
68 static const char *pp2_get_display_ascii(pp2_charset_token_t prt);
70  size_t *start, size_t *len);
71 
72 #if YAZ_HAVE_ICU
73 static const char *pp2_charset_token_icu(pp2_charset_token_t prt);
74 static const char *pp2_get_sort_icu(pp2_charset_token_t prt);
75 static const char *pp2_get_display_icu(pp2_charset_token_t prt);
76 static void pp2_get_org_icu(pp2_charset_token_t prt,
77  size_t *start, size_t *len);
78 #endif
79 
80 /* tokenizer handle */
82  const char *cp; /* unnormalized buffer we're tokenizing */
83  const char *last_cp; /* pointer to last token we're dealing with */
84  pp2_charset_t pct; /* our main charset handle (type+config) */
85  WRBUF norm_str; /* normalized string we return (temporarily) */
86  WRBUF sort_str; /* sort string we return (temporarily) */
87 #if YAZ_HAVE_ICU
88  yaz_icu_iter_t iter; /* ICU iterator; stays 0 when the charset has no ICU chain */
89 #endif
90  const char *cp0; /* start of the buffer given to pp2_charset_token_first() */
91  size_t start; /* offset from cp0 of the span reported for the current token */
92  size_t len; /* length of the span reported for the current token */
93 };
94 
97  int ref_count; /* set to 1 on creation; the factory is freed when it drops to 0 */
98 };
99 
102  pp2_charset_t pct;
103  char *name;
104 };
105 
106 
108  pp2_charset_t pct, const char *default_id);
109 
/* Body of pp2_charset_fact_create(): allocates an empty factory holding one
   reference and registers built-in charsets under fixed names.
   NOTE(review): the signature line and two statements between the
   "relevance" and "facet" registrations are not visible in this listing. */
111 {
112  pp2_charset_fact_t pft = xmalloc(sizeof(*pft));
113  pft->list = 0;
114  pft->ref_count = 1;
115 
116  pp2_charset_fact_add(pft, pp2_charset_create_a_to_z(), "relevance");
119  pp2_charset_fact_add(pft, pp2_charset_create(), "facet");
120  return pft;
121 }
122 
124 {
125  if (pft)
126  {
127  assert(pft->ref_count >= 1);
128  --(pft->ref_count);
129  if (pft->ref_count == 0)
130  {
131  struct pp2_charset_entry *pce = pft->list;
132  while (pce)
133  {
134  struct pp2_charset_entry *next = pce->next;
135  pp2_charset_destroy(pce->pct);
136  xfree(pce->name);
137  xfree(pce);
138  pce = next;
139  }
140  xfree(pft);
141  }
142  }
143 }
144 
146  pp2_charset_t pct, const char *default_id)
147 {
148  struct pp2_charset_entry *pce;
149 
150  for (pce = pft->list; pce; pce = pce->next)
151  if (!strcmp(default_id, pce->name))
152  break;
153 
154  if (!pce)
155  {
156  pce = xmalloc(sizeof(*pce));
157  pce->name = xstrdup(default_id);
158  pce->next = pft->list;
159  pft->list = pce;
160  }
161  else
162  {
163  pp2_charset_destroy(pce->pct);
164  }
165  pce->pct = pct;
166  return 0;
167 }
168 
170  xmlNode *xml_node, const char *id)
171 {
172  int r;
173  pp2_charset_t pct;
174 
175  assert(xml_node);
176 
177  if (strcmp((const char *) xml_node->name, "icu_chain"))
178  {
179  yaz_log(YLOG_WARN, "Wrapper element <%s> deprecated", xml_node->name);
180  yaz_log(YLOG_LOG, "Use <icu_chain id=\"%s\">.. only", xml_node->name);
181  xml_node = xml_node->children;
182  while (xml_node && xml_node->type != XML_ELEMENT_NODE)
183  xml_node = xml_node->next;
184  }
185  if (!xml_node)
186  {
187  yaz_log(YLOG_FATAL, "Missing icu_chain element");
188  return -1;
189  }
190  pct = pp2_charset_create_xml(xml_node);
191  if (!pct)
192  return -1;
193  if (!id)
194  {
195  id = yaz_xml_get_prop(xml_node, "id");
196  if (!id)
197  {
198  yaz_log(YLOG_WARN, "Missing id for icu_chain");
199  pp2_charset_destroy(pct);
200  return -1;
201  }
202  }
203  r = pp2_charset_fact_add(pft, pct, id);
204  return r;
205 }
206 
208 {
209  (pft->ref_count)++;
210 }
211 
212 pp2_charset_t pp2_charset_create_xml(xmlNode *xml_node)
213 {
214 #if YAZ_HAVE_ICU
215  UErrorCode status = U_ZERO_ERROR;
216  struct icu_chain *chain = 0;
217  chain = icu_chain_xml_config(xml_node, 1, &status);
218  if (!chain || U_FAILURE(status))
219  {
220  yaz_log(YLOG_FATAL, "Could not parse ICU chain config:\n"
221  "<%s>\n ... \n</%s>",
222  xml_node->name, xml_node->name);
223  return 0;
224  }
225  return pp2_charset_create_icu(chain);
226 #else // YAZ_HAVE_ICU
227  yaz_log(YLOG_FATAL, "Error: ICU support requested with element:\n"
228  "<%s>\n ... \n</%s>",
229  xml_node->name, xml_node->name);
230  yaz_log(YLOG_FATAL,
231  "But no ICU support is compiled into the YAZ library.");
232  return 0;
233 #endif // YAZ_HAVE_ICU
234 }
235 
236 pp2_charset_t pp2_charset_create(void)
237 {
238  pp2_charset_t pct = xmalloc(sizeof(*pct));
239 
244 #if YAZ_HAVE_ICU
245  pct->icu_chn = 0;
246 #endif // YAZ_HAVE_ICU
247  return pct;
248 }
249 
250 pp2_charset_t pp2_charset_create_a_to_z(void)
251 {
252  pp2_charset_t pct = pp2_charset_create();
254  return pct;
255 }
256 
257 #if YAZ_HAVE_ICU
258 pp2_charset_t pp2_charset_create_icu(struct icu_chain *icu_chn)
259 {
260  pp2_charset_t pct = pp2_charset_create();
261  if (icu_chn)
262  {
263  pct->icu_chn = icu_chn;
264  pct->icu_sts = U_ZERO_ERROR;
265  pct->token_next_handler = pp2_charset_token_icu;
266  pct->get_sort_handler = pp2_get_sort_icu;
267  pct->get_display_handler = pp2_get_display_icu;
268  pct->get_org_handler = pp2_get_org_icu;
269  }
270  return pct;
271 }
272 #endif // YAZ_HAVE_ICU
273 
274 void pp2_charset_destroy(pp2_charset_t pct)
275 {
276 #if YAZ_HAVE_ICU
277  icu_chain_destroy(pct->icu_chn);
278 #endif
279  xfree(pct);
280 }
281 
283  const char *id)
284 {
285  struct pp2_charset_entry *pce;
286  for (pce = pft->list; pce; pce = pce->next)
287  if (!strcmp(id, pce->name))
288  return pp2_charset_tokenize(pce->pct);
289  return 0;
290 }
291 
293 {
294  pp2_charset_token_t prt = xmalloc(sizeof(*prt));
295 
296  assert(pct);
297 
298  prt->norm_str = wrbuf_alloc();
299  prt->sort_str = wrbuf_alloc();
300  prt->cp = 0;
301  prt->last_cp = 0;
302  prt->pct = pct;
303 
304 #if YAZ_HAVE_ICU
305  prt->iter = 0;
306  if (pct->icu_chn)
307  prt->iter = icu_iter_create(pct->icu_chn);
308 #endif
309  prt->start = 0;
310  prt->len = 0;
311  return prt;
312 }
313 
315  const char *buf, int skip_article)
316 {
317  if (skip_article)
318  {
319  const char *p = buf;
320  char firstword[64];
321  char *pout = firstword;
322  char articles[] = "the den der die des an a "; // must end in space
323 
324  for (; *p && *p != ' ' && pout - firstword < (sizeof(firstword)-2); p++)
325  *pout++ = tolower(*(unsigned char *)p);
326  *pout++ = ' ';
327  *pout++ = '\0';
328  if (strstr(articles, firstword))
329  buf = p;
330  }
331 
332  wrbuf_rewind(prt->norm_str);
333  wrbuf_rewind(prt->sort_str);
334  prt->cp0 = buf;
335  prt->cp = buf;
336  prt->last_cp = 0;
337 
338 #if YAZ_HAVE_ICU
339  if (prt->iter)
340  {
341  icu_iter_first(prt->iter, buf);
342  }
343 #endif // YAZ_HAVE_ICU
344 }
345 
347 {
348  assert(prt);
349 #if YAZ_HAVE_ICU
350  if (prt->iter)
351  icu_iter_destroy(prt->iter);
352 #endif
353  if(prt->norm_str)
354  wrbuf_destroy(prt->norm_str);
355  if(prt->sort_str)
356  wrbuf_destroy(prt->sort_str);
357  xfree(prt);
358 }
359 
361 {
362  assert(prt);
363  return (prt->pct->token_next_handler)(prt);
364 }
365 
367 {
368  return prt->pct->get_sort_handler(prt);
369 }
370 
372 {
373  return prt->pct->get_display_handler(prt);
374 }
375 
376 void pp2_get_org(pp2_charset_token_t prt, size_t *start, size_t *len)
377 {
378  prt->pct->get_org_handler(prt, start, len);
379 }
380 
381 
382 #define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) : -1)
383 /* original tokenizer with our tokenize interface, but we
384  add +1 to ensure no '\0' are in our string (except for EOF)
385 */
387 {
388  const char *cp = prt->cp;
389  int c;
390 
391  prt->start = cp - prt->cp0;
392  /* skip white space */
393  while (*cp && (c = raw_char(tolower(*(const unsigned char *)cp))) < 0)
394  cp++;
395  if (*cp == '\0')
396  {
397  prt->cp = cp;
398  prt->last_cp = 0;
399  return 0;
400  }
401  /* now read the term itself */
402 
403  prt->last_cp = cp;
404  wrbuf_rewind(prt->norm_str);
405  while (*cp && (c = raw_char(tolower(*cp))) >= 0)
406  {
407  wrbuf_putc(prt->norm_str, c);
408  cp++;
409  }
410  prt->len = (cp - prt->cp0) - prt->start;
411  prt->cp = cp;
412  return wrbuf_cstr(prt->norm_str);
413 }
414 
416 {
417  if (prt->last_cp == 0)
418  return 0;
419  else
420  {
421  char *tmp = xstrdup(prt->last_cp);
422  char *result = 0;
423  result = normalize7bit_mergekey(tmp);
424 
425  wrbuf_rewind(prt->sort_str);
426  wrbuf_puts(prt->sort_str, result);
427  xfree(tmp);
428  return wrbuf_cstr(prt->sort_str);
429  }
430 }
431 
433 {
434  if (prt->last_cp == 0)
435  return 0;
436  else
437  {
438  return wrbuf_cstr(prt->norm_str);
439  }
440 }
441 
443  size_t *start, size_t *len)
444 {
445  *start = prt->start;
446  *len = prt->len;
447 }
448 
450 {
451  const char *cp = prt->cp;
452 
453  prt->last_cp = *cp ? cp : 0;
454  while (*cp)
455  cp++;
456  prt->cp = cp;
457  prt->len = cp - prt->cp0;
458  return prt->last_cp;
459 }
460 
461 #if YAZ_HAVE_ICU
462 static const char *pp2_charset_token_icu(pp2_charset_token_t prt)
463 {
464  if (icu_iter_next(prt->iter))
465  {
466  return icu_iter_get_norm(prt->iter);
467  }
468  return 0;
469 }
470 
471 static const char *pp2_get_sort_icu(pp2_charset_token_t prt)
472 {
473  return icu_iter_get_sortkey(prt->iter);
474 }
475 
476 static const char *pp2_get_display_icu(pp2_charset_token_t prt)
477 {
478  return icu_iter_get_display(prt->iter);
479 }
480 
481 static void pp2_get_org_icu(pp2_charset_token_t prt, size_t *start, size_t *len)
482 {
483  icu_iter_get_org_info(prt->iter, start, len);
484 }
485 
486 #endif // YAZ_HAVE_ICU
487 
488 
489 /*
490  * Local variables:
491  * c-basic-offset: 4
492  * c-file-style: "Stroustrup"
493  * indent-tabs-mode: nil
494  * End:
495  * vim: shiftwidth=4 tabstop=8 expandtab
496  */
497 
static pp2_charset_t pp2_charset_create(void)
Definition: charsets.c:236
static pp2_charset_token_t pp2_charset_tokenize(pp2_charset_t pct)
Definition: charsets.c:292
void(* get_org_handler)(pp2_charset_token_t ptr, size_t *start, size_t *len)
Definition: charsets.c:57
const char * cp0
Definition: charsets.c:90
static const char * pp2_charset_token_a_to_z(pp2_charset_token_t prt)
Definition: charsets.c:386
static const char * pp2_charset_token_null(pp2_charset_token_t prt)
Definition: charsets.c:449
const char *(* get_sort_handler)(pp2_charset_token_t prt)
Definition: charsets.c:55
static void pp2_charset_destroy(pp2_charset_t pct)
Definition: charsets.c:274
void pp2_charset_fact_destroy(pp2_charset_fact_t pft)
Definition: charsets.c:123
const char * pp2_get_sort(pp2_charset_token_t prt)
Definition: charsets.c:366
char * name
Definition: charsets.c:103
int pp2_charset_fact_define(pp2_charset_fact_t pft, xmlNode *xml_node, const char *id)
Definition: charsets.c:169
pp2_charset_t pct
Definition: charsets.c:84
static pp2_charset_t pp2_charset_create_a_to_z(void)
Definition: charsets.c:250
struct pp2_charset_entry * list
Definition: charsets.c:96
char * normalize7bit_mergekey(char *buf)
Definition: normalize7bit.c:47
const char * pp2_get_display(pp2_charset_token_t prt)
Definition: charsets.c:371
Definition: charsets.c:100
void pp2_charset_fact_incref(pp2_charset_fact_t pft)
Definition: charsets.c:207
const char * last_cp
Definition: charsets.c:83
pp2_charset_fact_t pp2_charset_fact_create(void)
Definition: charsets.c:110
const char *(* token_next_handler)(pp2_charset_token_t prt)
Definition: charsets.c:54
static void pp2_get_org_ascii(pp2_charset_token_t prt, size_t *start, size_t *len)
Definition: charsets.c:442
const char * cp
Definition: charsets.c:82
pp2_charset_t pct
Definition: charsets.c:102
void pp2_get_org(pp2_charset_token_t prt, size_t *start, size_t *len)
Definition: charsets.c:376
const char * pp2_charset_token_next(pp2_charset_token_t prt)
Definition: charsets.c:360
void pp2_charset_token_first(pp2_charset_token_t prt, const char *buf, int skip_article)
Definition: charsets.c:314
struct pp2_charset_s * pp2_charset_t
Definition: charsets.c:40
void pp2_charset_token_destroy(pp2_charset_token_t prt)
Definition: charsets.c:346
static const char * pp2_get_sort_ascii(pp2_charset_token_t prt)
Definition: charsets.c:415
Pazpar2 Character set facilities.
pp2_charset_token_t pp2_charset_token_create(pp2_charset_fact_t pft, const char *id)
Definition: charsets.c:282
static int pp2_charset_fact_add(pp2_charset_fact_t pft, pp2_charset_t pct, const char *default_id)
Definition: charsets.c:145
struct pp2_charset_entry * next
Definition: charsets.c:101
#define raw_char(c)
Definition: charsets.c:382
static pp2_charset_t pp2_charset_create_xml(xmlNode *xml_node)
Definition: charsets.c:212
static const char * pp2_get_display_ascii(pp2_charset_token_t prt)
Definition: charsets.c:432
const char *(* get_display_handler)(pp2_charset_token_t prt)
Definition: charsets.c:56