IDZEBRA  2.2.7
mod_grs_xml.c
Go to the documentation of this file.
1 /* This file is part of the Zebra server.
2  Copyright (C) Index Data
3 
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8 
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 for more details.
13 
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 
18 */
19 
#if HAVE_CONFIG_H
#include <config.h>
#endif
#if HAVE_EXPAT_H

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#if HAVE_ICONV_H
#include <errno.h>
#include <iconv.h>
#endif

#include <yaz/log.h>
#include <yaz/snprintf.h>
#include <yaz/log.h>
#include <yaz/xmalloc.h>
#include <idzebra/recgrs.h>

#include <expat.h>
40 
/* Number of bytes requested from the input per XML_GetBuffer/read round. */
#define XML_CHUNK 1024
42 
43 struct user_info {
44  data1_node *d1_stack[256];
45  int level;
46  data1_handle dh;
47  NMEM nmem;
48  int loglevel;
49 };
50 
51 static void report_xml_error(XML_Parser parser)
52 {
53  zint line = XML_GetCurrentLineNumber(parser);
54  zint col = XML_GetCurrentColumnNumber(parser);
55  yaz_log(YLOG_WARN, ZINT_FORMAT ":" ZINT_FORMAT ":XML error: %s",
56  line, col, XML_ErrorString(XML_GetErrorCode(parser)));
57 }
58 
59 static void cb_start(void *user, const char *el, const char **attr)
60 {
61  struct user_info *ui = (struct user_info*) user;
62  if (ui->level == 1)
63  data1_set_root (ui->dh, ui->d1_stack[0], ui->nmem, el);
64  ui->d1_stack[ui->level] = data1_mk_tag(ui->dh, ui->nmem, el, attr,
65  ui->d1_stack[ui->level-1]);
66  ui->level++;
67  yaz_log (ui->loglevel, "cb_start %s", el);
68 }
69 
70 static void cb_end (void *user, const char *el)
71 {
72  struct user_info *ui = (struct user_info*) user;
73 
74  ui->level--;
75  yaz_log(ui->loglevel, "cb_end %s", el);
76 }
77 
78 static void cb_chardata(void *user, const char *s, int len)
79 {
80  struct user_info *ui = (struct user_info*) user;
81 #if 0
82  yaz_log (ui->loglevel, "cb_chardata %.*s", len, s);
83 #endif
84  ui->d1_stack[ui->level] = data1_mk_text_n(ui->dh, ui->nmem, s, len,
85  ui->d1_stack[ui->level -1]);
86 }
87 
88 static void cb_decl(void *user, const char *version, const char *encoding,
89  int standalone)
90 {
91  struct user_info *ui = (struct user_info*) user;
92  const char *attr_list[7];
93 
94  attr_list[0] = "version";
95  attr_list[1] = version;
96 
97  attr_list[2] = "encoding";
98  attr_list[3] = "UTF-8"; /* internally it's always UTF-8 */
99 
100  attr_list[4] = "standalone";
101  attr_list[5] = standalone ? "yes" : "no";
102 
103  attr_list[6] = 0;
104 
105  data1_mk_preprocess(ui->dh, ui->nmem, "xml", attr_list,
106  ui->d1_stack[ui->level-1]);
107 #if 0
108  yaz_log (YLOG_LOG, "decl version=%s encoding=%s",
109  version ? version : "null",
110  encoding ? encoding : "null");
111 #endif
112 }
113 
114 static void cb_processing(void *user, const char *target,
115  const char *data)
116 {
117  struct user_info *ui = (struct user_info*) user;
118  data1_node *res =
119  data1_mk_preprocess(ui->dh, ui->nmem, target, 0,
120  ui->d1_stack[ui->level-1]);
121  data1_mk_text_nf(ui->dh, ui->nmem, data, strlen(data), res);
122 
123  yaz_log(ui->loglevel, "decl processing target=%s data=%s",
124  target ? target : "null",
125  data ? data : "null");
126 }
127 
128 static void cb_comment(void *user, const char *data)
129 {
130  struct user_info *ui = (struct user_info*) user;
131  yaz_log(ui->loglevel, "decl comment data=%s", data ? data : "null");
132  data1_mk_comment(ui->dh, ui->nmem, data, ui->d1_stack[ui->level-1]);
133 }
134 
135 static void cb_doctype_start(void *userData, const char *doctypeName,
136  const char *sysid, const char *pubid,
137  int has_internal_subset)
138 {
139  struct user_info *ui = (struct user_info*) userData;
140  yaz_log(ui->loglevel, "doctype start doctype=%s sysid=%s pubid=%s",
141  doctypeName, sysid, pubid);
142 }
143 
144 static void cb_doctype_end(void *userData)
145 {
146  struct user_info *ui = (struct user_info*) userData;
147  yaz_log(ui->loglevel, "doctype end");
148 }
149 
150 
151 static void cb_entity_decl(void *userData, const char *entityName,
152  int is_parameter_entity,
153  const char *value, int value_length,
154  const char *base, const char *systemId,
155  const char *publicId, const char *notationName)
156 {
157  struct user_info *ui = (struct user_info*) userData;
158  yaz_log(ui->loglevel,
159  "entity decl %s is_para_entry=%d value=%.*s base=%s systemId=%s"
160  " publicId=%s notationName=%s",
161  entityName, is_parameter_entity, value_length, value,
162  base, systemId, publicId, notationName);
163 
164 }
165 
166 static int cb_external_entity(XML_Parser pparser,
167  const char *context,
168  const char *base,
169  const char *systemId,
170  const char *publicId)
171 {
172  struct user_info *ui = (struct user_info*) XML_GetUserData(pparser);
173  FILE *inf;
174  int done = 0;
175  XML_Parser parser;
176 
177  yaz_log(ui->loglevel,
178  "external entity context=%s base=%s systemid=%s publicid=%s",
179  context, base, systemId, publicId);
180  if (!systemId)
181  return 1;
182 
183  if (!(inf = fopen(systemId, "rb")))
184  {
185  yaz_log (YLOG_WARN|YLOG_ERRNO, "fopen %s", systemId);
186  return 0;
187  }
188 
189  parser = XML_ExternalEntityParserCreate(pparser, "", 0);
190  while (!done)
191  {
192  int r;
193  void *buf = XML_GetBuffer(parser, XML_CHUNK);
194  if (!buf)
195  {
196  yaz_log(YLOG_WARN, "XML_GetBuffer fail");
197  break;
198  }
199  r = fread(buf, 1, XML_CHUNK, inf);
200  if (r == 0)
201  {
202  if (ferror(inf))
203  {
204  yaz_log(YLOG_WARN|YLOG_ERRNO, "fread %s", systemId);
205  break;
206  }
207  done = 1;
208  }
209  if (!XML_ParseBuffer(parser, r, done))
210  {
211  done = 1;
212  report_xml_error(parser);
213  }
214  }
215  fclose (inf);
216  XML_ParserFree(parser);
217  return done;
218 }
219 
220 
221 #if HAVE_ICONV_H
222 static int cb_encoding_convert(void *data, const char *s)
223 {
224  iconv_t t = (iconv_t) data;
225  size_t ret;
226  size_t outleft = 2;
227  char outbuf_[2], *outbuf = outbuf_;
228  size_t inleft = 4;
229  char *inbuf = (char *) s;
230  unsigned short code;
231 
232 #if 1
233  yaz_log(YLOG_LOG, "------------------------- cb_encoding_convert --- ");
234 #endif
235  ret = iconv(t, &inbuf, &inleft, &outbuf, &outleft);
236  if (ret == (size_t) (-1) && errno != E2BIG)
237  {
238  iconv (t, 0, 0, 0, 0);
239  return -1;
240  }
241  if (outleft != 0)
242  return -1;
243  memcpy (&code, outbuf_, sizeof(short));
244  return code;
245 }
246 
247 static void cb_encoding_release(void *data)
248 {
249  iconv_t t = (iconv_t) data;
250  iconv_close (t);
251 }
252 
253 static int cb_encoding_handler(void *userData, const char *name,
254  XML_Encoding *info)
255 {
256  int i = 0;
257  int no_ok = 0;
258  struct user_info *ui = (struct user_info*) userData;
259 
260  iconv_t t = iconv_open("UNICODE", name);
261  if (t == (iconv_t) (-1))
262  return 0;
263 
264  info->data = 0; /* signal that multibyte is not in use */
265  yaz_log(ui->loglevel, "Encoding handler of %s", name);
266  for (i = 0; i<256; i++)
267  {
268  size_t ret;
269  char outbuf_[5];
270  char inbuf_[5];
271  char *inbuf = inbuf_;
272  char *outbuf = outbuf_;
273  size_t inleft = 1;
274  size_t outleft = 2;
275  inbuf_[0] = i;
276 
277  iconv (t, 0, 0, 0, 0); /* reset iconv */
278 
279  ret = iconv(t, &inbuf, &inleft, &outbuf, &outleft);
280  if (ret == (size_t) (-1))
281  {
282  if (errno == EILSEQ)
283  {
284  yaz_log(ui->loglevel, "Encoding %d: invalid sequence", i);
285  info->map[i] = -1; /* invalid sequence */
286  }
287  if (errno == EINVAL)
288  { /* multi byte input */
289  int len = 2;
290  int j = 0;
291  info->map[i] = -1;
292 
293  while (len <= 4)
294  {
295  inbuf = inbuf_;
296  inleft = len;
297  outbuf = outbuf_;
298  outleft = 2;
299 
300  inbuf_[len-1] = j;
301  iconv (t, 0,0,0,0);
302 
303  assert (i >= 0 && i<255);
304 
305  ret = iconv(t, &inbuf, &inleft, &outbuf, &outleft);
306  if (ret == (size_t) (-1))
307  {
308  if (errno == EILSEQ || errno == E2BIG)
309  {
310  j++;
311  if (j > 255)
312  break;
313  }
314  else if (errno == EINVAL)
315  {
316  len++;
317  j = 7;
318  }
319  }
320  else if (outleft == 0)
321  {
322  info->map[i] = -len;
323  info->data = t; /* signal that multibyte is in use */
324  break;
325  }
326  else
327  {
328  break;
329  }
330  }
331  if (info->map[i] < -1)
332  yaz_log(ui->loglevel, "Encoding %d: multibyte input %d",
333  i, -info->map[i]);
334  else
335  yaz_log(ui->loglevel, "Encoding %d: multibyte input failed",
336  i);
337  }
338  if (errno == E2BIG)
339  {
340  info->map[i] = -1; /* no room for output */
341  if (i != 0)
342  yaz_log(YLOG_WARN, "Encoding %d: no room for output",
343  i);
344  }
345  }
346  else if (outleft == 0)
347  {
348  unsigned short code;
349  memcpy (&code, outbuf_, sizeof(short));
350  info->map[i] = code;
351  no_ok++;
352  }
353  else
354  { /* should never happen */
355  info->map[i] = -1;
356  yaz_log (YLOG_DEBUG, "Encoding %d: bad state", i);
357  }
358  }
359  if (info->data)
360  { /* at least one multi byte */
361  info->convert = cb_encoding_convert;
362  info->release = cb_encoding_release;
363  }
364  else
365  {
366  /* no multi byte - we no longer need iconv handler */
367  iconv_close(t);
368  info->convert = 0;
369  info->release = 0;
370  }
371  if (!no_ok)
372  return 0;
373  return 1;
374 }
375 /* HAVE_ICONV_H */
376 #endif
377 
378 static void cb_ns_start(void *userData, const char *prefix, const char *uri)
379 {
380  struct user_info *ui = (struct user_info*) userData;
381  if (prefix && uri)
382  yaz_log(ui->loglevel, "cb_ns_start %s %s", prefix, uri);
383 }
384 
385 static void cb_ns_end(void *userData, const char *prefix)
386 {
387  struct user_info *ui = (struct user_info*) userData;
388  if (prefix)
389  yaz_log(ui->loglevel, "cb_ns_end %s", prefix);
390 }
391 
392 data1_node *zebra_read_xml(data1_handle dh,
393  struct ZebraRecStream *stream,
394  NMEM m)
395 {
396  XML_Parser parser;
397  struct user_info uinfo;
398  int done = 0;
399  data1_node *first_node;
400  int no_read = 0;
401 
402  uinfo.loglevel = YLOG_DEBUG;
403  uinfo.level = 1;
404  uinfo.dh = dh;
405  uinfo.nmem = m;
406  uinfo.d1_stack[0] = data1_mk_node2 (dh, m, DATA1N_root, 0);
407  uinfo.d1_stack[1] = 0; /* indicate no children (see end of routine) */
408 
409  parser = XML_ParserCreate (0 /* encoding */);
410 
411  XML_SetElementHandler(parser, cb_start, cb_end);
412  XML_SetCharacterDataHandler(parser, cb_chardata);
413  XML_SetXmlDeclHandler(parser, cb_decl);
414  XML_SetProcessingInstructionHandler(parser, cb_processing);
415  XML_SetUserData(parser, &uinfo);
416  XML_SetCommentHandler(parser, cb_comment);
417  XML_SetDoctypeDeclHandler(parser, cb_doctype_start, cb_doctype_end);
418  XML_SetEntityDeclHandler(parser, cb_entity_decl);
419  XML_SetExternalEntityRefHandler(parser, cb_external_entity);
420  XML_SetNamespaceDeclHandler(parser, cb_ns_start, cb_ns_end);
421 #if HAVE_ICONV_H
422  XML_SetUnknownEncodingHandler(parser, cb_encoding_handler, &uinfo);
423 #endif
424  while (!done)
425  {
426  int r;
427  void *buf = XML_GetBuffer(parser, XML_CHUNK);
428  if (!buf)
429  {
430  /* error */
431  yaz_log(YLOG_WARN, "XML_GetBuffer fail");
432  break;
433  }
434  r = stream->readf(stream, buf, XML_CHUNK);
435  if (r < 0)
436  {
437  /* error */
438  yaz_log(YLOG_WARN, "XML read fail");
439  break;
440  }
441  else if (r == 0)
442  done = 1;
443  else
444  no_read += r;
445  if (no_read && !XML_ParseBuffer(parser, r, done))
446  {
447  done = 1;
448  report_xml_error(parser);
449  }
450  }
451  XML_ParserFree(parser);
452  if (no_read == 0)
453  return 0;
454  if (!uinfo.d1_stack[1] || !done)
455  return 0;
456  /* insert XML header if not present .. */
457  first_node = uinfo.d1_stack[0]->child;
458  if (first_node->which != DATA1N_preprocess ||
459  strcmp(first_node->u.preprocess.target, "xml"))
460  {
461  const char *attr_list[5];
462 
463  attr_list[0] = "version";
464  attr_list[1] = "1.0";
465 
466  attr_list[2] = "encoding";
467  attr_list[3] = "UTF-8"; /* encoding */
468 
469  attr_list[4] = 0;
470 
471  data1_insert_preprocess(uinfo.dh, uinfo.nmem, "xml", attr_list,
472  uinfo.d1_stack[0]);
473  }
474  return uinfo.d1_stack[0];
475 }
476 
477 struct xml_info {
478  XML_Expat_Version expat_version;
479 };
480 
481 static data1_node *grs_read_xml(struct grs_read_info *p)
482 {
483  return zebra_read_xml(p->dh, p->stream, p->mem);
484 }
485 
486 static void *filter_init(Res res, RecType recType)
487 {
488  struct xml_info *p = (struct xml_info *) xmalloc (sizeof(*p));
489 
490  p->expat_version = XML_ExpatVersionInfo();
491 
492  return p;
493 }
494 
/* Filter destructor: free the state allocated by filter_init. */
static void filter_destroy(void *clientData)
{
    struct xml_info *info = (struct xml_info *) clientData;

    xfree(info);
}
501 
502 static int filter_extract(void *clientData, struct recExtractCtrl *ctrl)
503 {
504  return zebra_grs_extract(clientData, ctrl, grs_read_xml);
505 }
506 
507 static int filter_retrieve(void *clientData, struct recRetrieveCtrl *ctrl)
508 {
509  return zebra_grs_retrieve(clientData, ctrl, grs_read_xml);
510 }
511 
512 static struct recType filter_type = {
513  0,
514  "grs.xml",
515  filter_init,
516  0,
520 };
521 
522 RecType
523 #if IDZEBRA_STATIC_GRS_XML
524 idzebra_filter_grs_xml
525 #else
527 #endif
528 
529 [] = {
530  &filter_type,
531  0,
532 };
533 
534 #endif
535 
536 /*
537  * Local variables:
538  * c-basic-offset: 4
539  * c-file-style: "Stroustrup"
540  * indent-tabs-mode: nil
541  * End:
542  * vim: shiftwidth=4 tabstop=8 expandtab
543  */
544 
data1_node * data1_mk_tag(data1_handle dh, NMEM nmem, const char *tag, const char **attr, data1_node *at)
Definition: d1_read.c:295
data1_node * data1_insert_preprocess(data1_handle dh, NMEM nmem, const char *target, const char **attr, data1_node *at)
Definition: d1_read.c:239
data1_node * data1_mk_comment(data1_handle dh, NMEM mem, const char *buf, data1_node *parent)
Definition: d1_read.c:362
#define DATA1N_root
Definition: data1.h:274
data1_node * data1_mk_node2(data1_handle dh, NMEM m, int type, data1_node *parent)
Definition: d1_read.c:146
#define DATA1N_preprocess
Definition: data1.h:284
data1_node * data1_mk_text_n(data1_handle dh, NMEM mem, const char *buf, size_t len, data1_node *parent)
Definition: d1_read.c:331
data1_node * data1_mk_text_nf(data1_handle dh, NMEM mem, const char *buf, size_t len, data1_node *parent)
Definition: d1_read.c:339
data1_node * data1_mk_preprocess(data1_handle dh, NMEM nmem, const char *target, const char **attr, data1_node *at)
Definition: d1_read.c:219
void data1_set_root(data1_handle dh, data1_node *res, NMEM nmem, const char *name)
Definition: d1_read.c:191
static void filter_destroy(void *clientData)
Definition: mod_alvis.c:333
RecType idzebra_filter[]
Definition: mod_alvis.c:722
static int filter_retrieve(void *clientData, struct recRetrieveCtrl *p)
Definition: mod_alvis.c:585
static int filter_extract(void *clientData, struct recExtractCtrl *p)
Definition: mod_alvis.c:563
static struct recType filter_type
Definition: mod_alvis.c:705
static void * filter_init(Res res, RecType recType)
Definition: mod_alvis.c:125
static FILE * inf
Definition: readfile.c:37
int zebra_grs_retrieve(void *clientData, struct recRetrieveCtrl *p, data1_node *(*grs_read)(struct grs_read_info *))
Definition: recgrs.c:1072
int zebra_grs_extract(void *clientData, struct recExtractCtrl *p, data1_node *(*grs_read)(struct grs_read_info *))
Definition: recgrs.c:936
record reader stream
Definition: recctrl.h:71
int(* readf)(struct ZebraRecStream *s, char *buf, size_t count)
read function
Definition: recctrl.h:75
struct data1_node::@2::@7 preprocess
struct data1_node * child
Definition: data1.h:341
union data1_node::@2 u
int which
Definition: data1.h:285
data1_handle dh
Definition: recgrs.h:31
struct ZebraRecStream * stream
Definition: recgrs.h:28
NMEM mem
Definition: recgrs.h:30
record extract for indexing
Definition: recctrl.h:101
Definition: res.c:46
long zint
Zebra integer.
Definition: util.h:66
#define ZINT_FORMAT
Definition: util.h:72