YAZ  5.23.1
icu_chain.c
Go to the documentation of this file.
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) Index Data
3  * See the file LICENSE for details.
4  */
5 
11 #if HAVE_CONFIG_H
12 #include "config.h"
13 #endif
14 
15 #if YAZ_HAVE_ICU
16 #include <yaz/xmalloc.h>
17 
18 #include <yaz/icu_I18N.h>
19 
20 #include <yaz/stemmer.h>
21 
22 #include <yaz/log.h>
23 #include <yaz/nmem.h>
24 #include <yaz/nmem_xml.h>
25 #include <yaz/xml_get.h>
26 #include <string.h>
27 #include <stdlib.h>
28 #include <stdio.h>
29 #include <assert.h>
30 
31 #include <unicode/ustring.h> /* some more string fcns*/
32 #include <unicode/uchar.h> /* char names */
33 
/* Kinds of processing step that can be stored in an ICU chain. */
enum icu_chain_step_type {
    ICU_chain_step_type_none,          /* no step; rejected by icu_chain_insert_step */
    ICU_chain_step_type_display,       /* convert to utf8 display format */
    ICU_chain_step_type_casemap,       /* apply utf16 casemap */
    ICU_chain_step_type_transform,     /* apply utf16 transform (by ID) */
    ICU_chain_step_type_tokenize,      /* apply utf16 tokenization */
    ICU_chain_step_type_transliterate, /* apply utf16 transform built from custom rules */
    YAZ_chain_step_type_stemming,      /* apply utf16 stemming (YAZ) */
    ICU_chain_step_type_join           /* join the tokens using a separator string */
};
44 
45 struct icu_chain_step
46 {
47  /* type and action object */
48  enum icu_chain_step_type type;
49  union {
50  struct icu_casemap *casemap;
51  struct icu_transform *transform;
52  struct icu_tokenizer *tokenizer;
53  yaz_stemmer_p stemmer;
54  struct icu_buf_utf16 *join;
55  } u;
56  struct icu_chain_step *previous;
57 };
58 
59 struct icu_chain
60 {
61  yaz_icu_iter_t iter;
62  char *locale;
63  int sort;
64 
65  UCollator *coll;
66 
67  /* linked list of chain steps */
68  struct icu_chain_step *csteps;
69 };
70 
71 int icu_check_status(UErrorCode status)
72 {
73  if (U_FAILURE(status))
74  {
75  yaz_log(YLOG_WARN, "ICU: %d %s\n", status, u_errorName(status));
76  return 0;
77  }
78  return 1;
79 }
80 
81 static struct icu_chain_step *icu_chain_insert_step(
82  struct icu_chain *chain, enum icu_chain_step_type type,
83  const char *rule, UErrorCode *status)
84 {
85  struct icu_chain_step *step = 0;
86 
87  assert(chain);
88  assert(type);
89 
90  step = (struct icu_chain_step *) xmalloc(sizeof(*step));
91  step->type = type;
92 
93  switch (step->type)
94  {
95  case ICU_chain_step_type_display:
96  break;
97  case ICU_chain_step_type_casemap:
98  assert(rule);
99  step->u.casemap = icu_casemap_create(rule[0], status);
100  break;
101  case ICU_chain_step_type_transform:
102  assert(rule);
103  /* rule omitted. Only ID used */
104  step->u.transform = icu_transform_create(rule, 'f', 0, status);
105  break;
106  case ICU_chain_step_type_tokenize:
107  assert(rule);
108  step->u.tokenizer = icu_tokenizer_create(chain->locale, rule[0], status);
109  break;
110  case ICU_chain_step_type_transliterate:
111  assert(rule);
112  /* we pass a dummy ID to utrans_openU.. */
113  step->u.transform = icu_transform_create("custom", 'f', rule, status);
114  break;
115  case YAZ_chain_step_type_stemming:
116  assert(rule);
117  step->u.stemmer = yaz_stemmer_create(chain->locale, rule, status);
118  break;
119  case ICU_chain_step_type_join:
120  assert(rule);
121  step->u.join = icu_buf_utf16_create(0);
122  icu_utf16_from_utf8_cstr(step->u.join, rule, status);
123  break;
124  default:
125  break;
126  }
127  step->previous = chain->csteps;
128  chain->csteps = step;
129 
130  return step;
131 }
132 
133 
134 static void icu_chain_step_destroy(struct icu_chain_step *step)
135 {
136  if (!step)
137  return;
138 
139  icu_chain_step_destroy(step->previous);
140 
141  switch (step->type)
142  {
143  case ICU_chain_step_type_display:
144  break;
145  case ICU_chain_step_type_casemap:
146  icu_casemap_destroy(step->u.casemap);
147  break;
148  case ICU_chain_step_type_transform:
149  case ICU_chain_step_type_transliterate:
150  icu_transform_destroy(step->u.transform);
151  break;
152  case ICU_chain_step_type_tokenize:
153  icu_tokenizer_destroy(step->u.tokenizer);
154  break;
155  case YAZ_chain_step_type_stemming:
156  yaz_stemmer_destroy(step->u.stemmer);
157  break;
158  case ICU_chain_step_type_join:
159  icu_buf_utf16_destroy(step->u.join);
160  break;
161  default:
162  break;
163  }
164  xfree(step);
165 }
166 
167 struct icu_chain_step *icu_chain_step_clone(struct icu_chain_step *old)
168 {
169  struct icu_chain_step *step = 0;
170  struct icu_chain_step **sp = &step;
171  while (old)
172  {
173  *sp = (struct icu_chain_step *) xmalloc(sizeof(**sp));
174  (*sp)->type = old->type;
175 
176  switch ((*sp)->type)
177  {
178  case ICU_chain_step_type_display:
179  break;
180  case ICU_chain_step_type_casemap:
181  (*sp)->u.casemap = icu_casemap_clone(old->u.casemap);
182  break;
183  case ICU_chain_step_type_transform:
184  case ICU_chain_step_type_transliterate:
185  (*sp)->u.transform = icu_transform_clone(old->u.transform);
186  break;
187  case ICU_chain_step_type_tokenize:
188  (*sp)->u.tokenizer = icu_tokenizer_clone(old->u.tokenizer);
189  break;
190  case YAZ_chain_step_type_stemming:
191  (*sp)->u.stemmer = yaz_stemmer_clone(old->u.stemmer);
192  break;
193  case ICU_chain_step_type_none:
194  break;
195  case ICU_chain_step_type_join:
196  (*sp)->u.join = icu_buf_utf16_create(0);
197  (*sp)->u.join = icu_buf_utf16_copy((*sp)->u.join, old->u.join);
198  break;
199  }
200  old = old->previous;
201  sp = &(*sp)->previous;
202  }
203  *sp = 0;
204  return step;
205 }
206 
207 struct icu_chain *icu_chain_create(const char *locale, int sort,
208  UErrorCode *status)
209 {
210  struct icu_chain *chain;
211  UCollator *coll = ucol_open(locale, status);
212 
213  if (U_FAILURE(*status))
214  return 0;
215 
216  chain = (struct icu_chain *) xmalloc(sizeof(*chain));
217  chain->iter = 0;
218  chain->locale = xstrdup(locale);
219  chain->sort = sort;
220  chain->coll = coll;
221  chain->csteps = 0;
222 
223  return chain;
224 }
225 
226 void icu_chain_destroy(struct icu_chain *chain)
227 {
228  if (chain)
229  {
230  if (chain->coll)
231  ucol_close(chain->coll);
232 
233  if (chain->iter)
234  icu_iter_destroy(chain->iter);
235  icu_chain_step_destroy(chain->csteps);
236  xfree(chain->locale);
237  xfree(chain);
238  }
239 }
240 
241 struct icu_chain *icu_chain_xml_config(const xmlNode *xml_node,
242  int sort,
243  UErrorCode *status)
244 {
245  xmlNode *node = 0;
246  int no_errors = 0;
247  struct icu_chain *chain = 0;
248  NMEM nmem = 0;
249 
250  *status = U_ZERO_ERROR;
251 
252  if (xml_node && xml_node->type == XML_ELEMENT_NODE)
253  {
254  const char *xml_locale = yaz_xml_get_prop((xmlNode *) xml_node,
255  "locale");
256  if (xml_locale)
257  chain = icu_chain_create((const char *) xml_locale, sort, status);
258  }
259 
260  if (!chain)
261  return 0;
262 
263  nmem = nmem_create();
264  for (node = xml_node->children; node; node = node->next)
265  {
266  char *rule = 0;
267  struct icu_chain_step *step = 0;
268  const char *attr_str;
269 
270  nmem_reset(nmem);
271  if (node->type != XML_ELEMENT_NODE)
272  continue;
273  attr_str = yaz_xml_get_prop(node, "rule%s", &rule);
274  if (attr_str)
275  {
276  yaz_log(YLOG_WARN, "Unsupported attribute '%s' for "
277  "element '%s'", attr_str, node->name);
278  no_errors++;
279  }
280  if (!rule && node->children)
281  rule = nmem_text_node_cdata(node->children, nmem);
282 
283  if (!rule && strcmp((const char *) node->name, "display"))
284  {
285  yaz_log(YLOG_WARN, "Missing attribute 'rule' for element %s",
286  (const char *) node->name);
287  no_errors++;
288  continue;
289  }
290  if (!strcmp((const char *) node->name, "casemap"))
291  step = icu_chain_insert_step(chain,
292  ICU_chain_step_type_casemap,
293  rule, status);
294  else if (!strcmp((const char *) node->name, "transform"))
295  step = icu_chain_insert_step(chain,
296  ICU_chain_step_type_transform,
297  rule, status);
298  else if (!strcmp((const char *) node->name, "transliterate"))
299  step = icu_chain_insert_step(chain,
300  ICU_chain_step_type_transliterate,
301  rule, status);
302  else if (!strcmp((const char *) node->name, "tokenize"))
303  step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
304  rule, status);
305  else if (!strcmp((const char *) node->name, "display"))
306  step = icu_chain_insert_step(chain, ICU_chain_step_type_display,
307  rule, status);
308  else if (!strcmp((const char *) node->name, "stemming"))
309  step = icu_chain_insert_step(chain, YAZ_chain_step_type_stemming,
310  rule, status);
311  else if (!strcmp((const char *) node->name, "join"))
312  step = icu_chain_insert_step(chain, ICU_chain_step_type_join,
313  rule, status);
314  else if (!strcmp((const char *) node->name, "normalize"))
315  {
316  yaz_log(YLOG_WARN, "Element %s is deprecated. "
317  "Use transform instead", node->name);
318  step = icu_chain_insert_step(chain, ICU_chain_step_type_transform,
319  rule, status);
320  }
321  else if (!strcmp((const char *) node->name, "index")
322  || !strcmp((const char *) node->name, "sortkey"))
323  {
324  yaz_log(YLOG_WARN, "Element %s is no longer needed. "
325  "Remove it from the configuration", node->name);
326  }
327  else
328  {
329  yaz_log(YLOG_WARN, "Unknown element %s", node->name);
330  no_errors++;
331  continue;
332  }
333  if (!step)
334  {
335  yaz_log(YLOG_WARN, "Step not created for %s", node->name);
336  no_errors++;
337  }
338  if (step && U_FAILURE(*status))
339  {
340  yaz_log(YLOG_WARN, "ICU Error %d %s for element %s, rule %s",
341  *status, u_errorName(*status), node->name, rule ?
342  rule : "");
343  no_errors++;
344  break;
345  }
346  }
347  nmem_destroy(nmem);
348  if (no_errors)
349  {
350  icu_chain_destroy(chain);
351  return 0;
352  }
353  return chain;
354 }
355 
356 struct icu_iter {
357  struct icu_chain *chain;
358  struct icu_buf_utf16 *last;
359  struct icu_buf_utf16 *org;
360  struct icu_buf_utf8 *org8;
361  UErrorCode status;
362  struct icu_buf_utf8 *display;
363  struct icu_buf_utf8 *sort8;
364  struct icu_buf_utf8 *result;
365  int token_count;
366  size_t org_start;
367  size_t org_len;
368  size_t utf8_base;
369  size_t utf16_base;
370  struct icu_chain_step *steps;
371 };
372 
373 void icu_utf16_print(struct icu_buf_utf16 *src16)
374 {
375  UErrorCode status = U_ZERO_ERROR;
376  const char *p;
377  struct icu_buf_utf8 *dst8 = icu_buf_utf8_create(0);
378  icu_utf16_to_utf8(dst8, src16, &status);
379 
380  if (U_FAILURE(status))
381  {
382  printf("failure");
383  }
384  else
385  {
386  p = icu_buf_utf8_to_cstr(dst8);
387  printf("%s", p);
388  }
389  icu_buf_utf8_destroy(dst8);
390 }
391 
392 struct icu_buf_utf16 *icu_iter_invoke(yaz_icu_iter_t iter,
393  struct icu_chain_step *step,
394  struct icu_buf_utf16 *src)
395 {
396  if (!step)
397  return src;
398  else
399  {
400  struct icu_buf_utf16 *dst = icu_iter_invoke(iter, step->previous, src);
401 
402  switch (step->type)
403  {
404  case ICU_chain_step_type_casemap:
405  if (dst)
406  {
407  struct icu_buf_utf16 *src = dst;
408 
409  dst = icu_buf_utf16_create(0);
410  icu_casemap_casemap(step->u.casemap, dst, src, &iter->status,
411  iter->chain->locale);
413  }
414  break;
415  case ICU_chain_step_type_tokenize:
416  if (dst)
417  {
418  struct icu_buf_utf16 *src = dst;
419 
420  icu_tokenizer_attach(step->u.tokenizer, src, &iter->status);
421  if (step->previous)
422  { /* no need to copy if it's already the same */
423  iter->utf8_base = iter->utf16_base = 0;
424  icu_buf_utf16_copy(iter->org, src);
425  }
427  }
428  dst = icu_buf_utf16_create(0);
429  iter->status = U_ZERO_ERROR;
430  if (!icu_tokenizer_next_token(step->u.tokenizer, dst, &iter->status,
431  &iter->org_start, &iter->org_len))
432  {
434  dst = 0;
435  }
436  break;
437  case ICU_chain_step_type_transform:
438  case ICU_chain_step_type_transliterate:
439  if (dst)
440  {
441  struct icu_buf_utf16 *src = dst;
442  dst = icu_buf_utf16_create(0);
443  icu_transform_trans(step->u.transform, dst, src, &iter->status);
445  }
446  break;
447  case ICU_chain_step_type_display:
448  if (dst)
449  icu_utf16_to_utf8(iter->display, dst, &iter->status);
450  break;
451  case YAZ_chain_step_type_stemming:
452  if (dst)
453  {
454  struct icu_buf_utf16 *src = dst;
455  dst = icu_buf_utf16_create(0);
456  yaz_stemmer_stem(step->u.stemmer, dst, src, &iter->status);
458  }
459  break;
460  case ICU_chain_step_type_join:
461  if (dst)
462  {
463  while (1)
464  {
465  struct icu_buf_utf16 *dst1 =
466  icu_iter_invoke(iter, step->previous, 0);
467 
468  if (!dst1)
469  break;
470  dst = icu_buf_utf16_append(dst, step->u.join);
471  dst = icu_buf_utf16_append(dst, dst1);
472  icu_buf_utf16_destroy(dst1);
473  }
474  }
475  break;
476  default:
477  assert(0);
478  }
479  return dst;
480  }
481 }
482 
483 yaz_icu_iter_t icu_iter_create(struct icu_chain *chain)
484 {
485  yaz_icu_iter_t iter = xmalloc(sizeof(*iter));
486  iter->chain = chain;
487  iter->status = U_ZERO_ERROR;
488  iter->display = icu_buf_utf8_create(0);
489  iter->sort8 = icu_buf_utf8_create(0);
490  iter->result = icu_buf_utf8_create(0);
491  iter->org = icu_buf_utf16_create(0);
492  iter->org8 = 0;
493  iter->last = 0; /* no last returned string (yet) */
494  iter->steps = icu_chain_step_clone(chain->csteps);
495  iter->token_count = 0;
496 
497  return iter;
498 }
499 
500 void icu_iter_first(yaz_icu_iter_t iter, const char *src8cstr)
501 {
502  struct icu_buf_utf16 *src = icu_buf_utf16_create(0);
503  icu_utf16_from_utf8_cstr(src, src8cstr, &iter->status);
504  icu_buf_utf16_copy(iter->org, src);
505  iter->token_count = 0;
506  iter->org_start = 0;
507  iter->utf8_base = iter->utf16_base = 0;
508  iter->org_len = src->utf16_len;
509  iter->last = icu_iter_invoke(iter, iter->steps, src);
510 }
511 
513 {
514  if (iter)
515  {
516  icu_buf_utf8_destroy(iter->display);
517  icu_buf_utf8_destroy(iter->sort8);
518  icu_buf_utf8_destroy(iter->result);
519  icu_buf_utf16_destroy(iter->org);
520  icu_buf_utf8_destroy(iter->org8);
521  icu_chain_step_destroy(iter->steps);
522  xfree(iter);
523  }
524 }
525 
527 {
528  if (iter->token_count && iter->last)
529  iter->last = icu_iter_invoke(iter, iter->steps, 0);
530  if (!iter->last)
531  return 0;
532  else
533  {
534  iter->token_count++;
535  if (iter->chain->sort)
536  {
537  icu_sortkey8_from_utf16(iter->chain->coll,
538  iter->sort8, iter->last,
539  &iter->status);
540  }
541  icu_utf16_to_utf8(iter->result, iter->last, &iter->status);
542  icu_buf_utf16_destroy(iter->last);
543 
544  return 1;
545  }
546 }
547 
548 const char *icu_iter_get_norm(yaz_icu_iter_t iter)
549 {
550  return icu_buf_utf8_to_cstr(iter->result);
551 }
552 
553 const char *icu_iter_get_sortkey(yaz_icu_iter_t iter)
554 {
555  return icu_buf_utf8_to_cstr(iter->sort8);
556 }
557 
558 const char *icu_iter_get_display(yaz_icu_iter_t iter)
559 {
560  return icu_buf_utf8_to_cstr(iter->display);
561 }
562 
564 {
565  return iter->token_count;
566 }
567 
568 
569 void icu_iter_get_org_info2(yaz_icu_iter_t iter, size_t *start, size_t *len,
570  const char **cstr)
571 {
572  int32_t len1 = 0, len2 = 0;
573  UErrorCode status = U_ZERO_ERROR;
574 
575  if (iter->org_start < iter->utf16_base)
576  {
577  iter->utf8_base = 0;
578  iter->utf16_base = 0;
579  }
580  u_strToUTF8(0, 0, &len1,
581  iter->org->utf16 + iter->utf16_base,
582  iter->org_start - iter->utf16_base,
583  &status);
584 
585  status = U_ZERO_ERROR;
586 
587  *start = len1 + iter->utf8_base;
588 
589  u_strToUTF8(0, 0, &len2,
590  iter->org->utf16 + iter->utf16_base,
591  iter->org_start - iter->utf16_base + iter->org_len,
592  &status);
593 
594  *len = len2 - len1;
595 
596  if (cstr)
597  {
598  if (!iter->org8)
599  iter->org8 = icu_buf_utf8_create(0);
600  status = U_ZERO_ERROR;
601  icu_utf16_to_utf8(iter->org8, iter->org, &status);
602  *cstr = icu_buf_utf8_to_cstr(iter->org8);
603  }
604  iter->utf8_base = *start;
605  iter->utf16_base = iter->org_start;
606 }
607 
608 void icu_iter_get_org_info(yaz_icu_iter_t iter, size_t *start, size_t *len)
609 {
610  icu_iter_get_org_info2(iter, start, len, 0);
611 }
612 
613 int icu_chain_assign_cstr(struct icu_chain *chain, const char *src8cstr,
614  UErrorCode *status)
615 {
616  if (chain->iter)
617  icu_iter_destroy(chain->iter);
618  chain->iter = icu_iter_create(chain);
619  icu_iter_first(chain->iter, src8cstr);
620  return 1;
621 }
622 
623 int icu_chain_next_token(struct icu_chain *chain, UErrorCode *status)
624 {
625  *status = U_ZERO_ERROR;
626  return icu_iter_next(chain->iter);
627 }
628 
629 int icu_chain_token_number(struct icu_chain *chain)
630 {
631  if (chain && chain->iter)
632  return chain->iter->token_count;
633  return 0;
634 }
635 
636 const char *icu_chain_token_display(struct icu_chain *chain)
637 {
638  if (chain->iter)
639  return icu_iter_get_display(chain->iter);
640  return 0;
641 }
642 
643 const char *icu_chain_token_norm(struct icu_chain *chain)
644 {
645  if (chain->iter)
646  return icu_iter_get_norm(chain->iter);
647  return 0;
648 }
649 
650 const char *icu_chain_token_sortkey(struct icu_chain *chain)
651 {
652  if (chain->iter)
653  return icu_iter_get_sortkey(chain->iter);
654  return 0;
655 }
656 
657 void icu_chain_get_org_info(struct icu_chain *chain, size_t *start, size_t *len)
658 {
659  if (chain->iter)
660  icu_iter_get_org_info(chain->iter, start, len);
661 }
662 
663 void icu_chain_get_org_info2(struct icu_chain *chain, size_t *start,
664  size_t *len, const char **cstr)
665 {
666  if (chain->iter)
667  icu_iter_get_org_info2(chain->iter, start, len, cstr);
668 }
669 
670 
671 #endif /* YAZ_HAVE_ICU */
672 
673 /*
674  * Local variables:
675  * c-basic-offset: 4
676  * c-file-style: "Stroustrup"
677  * indent-tabs-mode: nil
678  * End:
679  * vim: shiftwidth=4 tabstop=8 expandtab
680  */
681 
void icu_transform_destroy(struct icu_transform *transform)
int icu_chain_token_number(yaz_icu_chain_t chain)
returns token number of last token processed
Header for the stemming API.
int icu_check_status(UErrorCode status)
Header for Nibble Memory functions + Libxml2 specific stuff.
int icu_chain_next_token(yaz_icu_chain_t chain, UErrorCode *status)
returns one token (if any)
const char * yaz_xml_get_prop(const xmlNode *n, const char *fmt,...)
Definition: xml_get.c:19
struct icu_casemap * icu_casemap_clone(struct icu_casemap *old)
static int node(struct cql_node *cn, void(*pr)(const char *buf, void *client_data), void *client_data)
Definition: cql2ccl.c:86
void icu_casemap_destroy(struct icu_casemap *casemap)
int icu_iter_get_token_number(yaz_icu_iter_t iter)
returns ICU token count for iterator
int icu_iter_next(yaz_icu_iter_t iter)
iterates over one token
void nmem_destroy(NMEM n)
destroys NMEM handle and memory associated with it
Definition: nmem.c:204
void icu_chain_get_org_info2(yaz_icu_chain_t chain, size_t *start, size_t *len, const char **cstr)
returns token as it relates to original text (2nd version)
struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action, UErrorCode *status)
int icu_chain_assign_cstr(yaz_icu_chain_t chain, const char *src8cstr, UErrorCode *status)
pass string to ICU for parsing/tokenization/etc
void nmem_reset(NMEM n)
releases memory associaged with an NMEM handle
Definition: nmem.c:129
#define xstrdup(s)
utility macro which calls xstrdup_f
Definition: xmalloc.h:55
const char * icu_iter_get_sortkey(yaz_icu_iter_t iter)
returns ICU sortkey string
const char * icu_iter_get_display(yaz_icu_iter_t iter)
returns ICU display string
yaz_icu_chain_t icu_chain_create(const char *locale, int sort, UErrorCode *status)
int icu_casemap_casemap(struct icu_casemap *casemap, struct icu_buf_utf16 *dest16, struct icu_buf_utf16 *src16, UErrorCode *status, const char *locale)
void icu_chain_get_org_info(yaz_icu_chain_t chain, size_t *start, size_t *len)
returns token as it relates to original text (legacy)
int icu_tokenizer_attach(struct icu_tokenizer *tokenizer, struct icu_buf_utf16 *src16, UErrorCode *status)
XML node getter/creation utilities.
const char * icu_buf_utf8_to_cstr(struct icu_buf_utf8 *src8)
void icu_iter_first(yaz_icu_iter_t iter, const char *src8cstr)
starts iteration over string
const char * icu_iter_get_norm(yaz_icu_iter_t iter)
returns ICU normalized token
UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 *dest8, const struct icu_buf_utf16 *src16, UErrorCode *status)
Internal header for ICU utilities.
#define xfree(x)
utility macro which calls xfree_f
Definition: xmalloc.h:53
void icu_iter_destroy(yaz_icu_iter_t iter)
destroy ICU tokenizer iterator
enum l_file_type type
Definition: log.c:45
struct icu_casemap * icu_casemap_create(char action, UErrorCode *status)
yaz_icu_chain_t icu_chain_xml_config(const xmlNode *xml_node, int sort, UErrorCode *status)
constructs ICU chain from XML specification
void icu_iter_get_org_info2(yaz_icu_iter_t iter, size_t *start, size_t *len, const char **cstr)
returns ICU original token start (offset) and length
void icu_chain_destroy(yaz_icu_chain_t chain)
destroys ICU chain
void icu_iter_get_org_info(yaz_icu_iter_t iter, size_t *start, size_t *len)
returns ICU original token start (offset) and length (legacy)
struct icu_transform * icu_transform_clone(struct icu_transform *old)
yaz_stemmer_p yaz_stemmer_clone(yaz_stemmer_p stemmer)
void yaz_stemmer_stem(yaz_stemmer_p stemmer, struct icu_buf_utf16 *dst, struct icu_buf_utf16 *src, UErrorCode *status)
struct icu_transform * icu_transform_create(const char *id, char action, const char *rules, UErrorCode *status)
int32_t utf16_len
Definition: icu_I18N.h:55
void icu_buf_utf16_destroy(struct icu_buf_utf16 *buf16)
const char * icu_chain_token_norm(yaz_icu_chain_t chain)
returns normalized token of last token processed
void icu_tokenizer_destroy(struct icu_tokenizer *tokenizer)
int32_t icu_tokenizer_next_token(struct icu_tokenizer *tokenizer, struct icu_buf_utf16 *tkn16, UErrorCode *status, size_t *start, size_t *len)
void icu_sortkey8_from_utf16(UCollator *coll, struct icu_buf_utf8 *dest8, struct icu_buf_utf16 *src16, UErrorCode *status)
struct icu_tokenizer * icu_tokenizer_clone(struct icu_tokenizer *old)
char * nmem_text_node_cdata(const xmlNode *ptr_cdata, NMEM nmem)
copies TEXT Libxml2 node data to NMEM
Definition: nmemsdup.c:145
struct icu_iter * yaz_icu_iter_t
ICU tokenizer iterator type (opaque)
Definition: icu.h:131
#define xmalloc(x)
utility macro which calls malloc_f
Definition: xmalloc.h:49
struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity)
Header for memory handling functions.
Header for Nibble Memory functions.
NMEM nmem_create(void)
returns new NMEM handle
Definition: nmem.c:181
int icu_transform_trans(struct icu_transform *transform, struct icu_buf_utf16 *dest16, const struct icu_buf_utf16 *src16, UErrorCode *status)
#define YLOG_WARN
log level: warning
Definition: log.h:46
void yaz_log(int level, const char *fmt,...)
Writes log message.
Definition: log.c:485
yaz_icu_iter_t icu_iter_create(struct icu_chain *chain)
create ICU tokenizer iterator from chain
struct icu_buf_utf16 * icu_buf_utf16_append(struct icu_buf_utf16 *dest16, const struct icu_buf_utf16 *src16)
Logging utility.
void yaz_stemmer_destroy(yaz_stemmer_p stemmer)
yaz_stemmer_p yaz_stemmer_create(const char *locale, const char *rule, UErrorCode *status)
struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 *dest16, const struct icu_buf_utf16 *src16)
void icu_buf_utf8_destroy(struct icu_buf_utf8 *buf8)
const char * icu_chain_token_display(yaz_icu_chain_t chain)
returns display token of last token processed
UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 *dest16, const char *src8cstr, UErrorCode *status)
struct yaz_stemmer_t * yaz_stemmer_p
Definition: stemmer.h:49
const char * icu_chain_token_sortkey(yaz_icu_chain_t chain)
returns sortkey token of last token processed
struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity)