IDZEBRA  2.1.2
mod_grs_xml.c
Go to the documentation of this file.
1 /* This file is part of the Zebra server.
2  Copyright (C) Index Data
3 
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8 
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 for more details.
13 
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 
18 */
19 
20 #if HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23 #if HAVE_EXPAT_H
24 
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#if HAVE_ICONV_H
#include <errno.h>
#include <iconv.h>
#endif

#include <yaz/log.h>

#include <idzebra/recgrs.h>

#include <yaz/log.h>
#include <yaz/xmalloc.h>

#include <expat.h>
41 
/* Number of bytes requested from Expat's buffer and read per iteration. */
#define XML_CHUNK 1024
43 
44 struct user_info {
45  data1_node *d1_stack[256];
46  int level;
47  data1_handle dh;
48  NMEM nmem;
49  int loglevel;
50 };
51 
52 static void report_xml_error(XML_Parser parser)
53 {
54  zint line = XML_GetCurrentLineNumber(parser);
55  zint col = XML_GetCurrentColumnNumber(parser);
56  yaz_log (YLOG_WARN, ZINT_FORMAT ":" ZINT_FORMAT ":XML error: %s",
57  line, col, XML_ErrorString(XML_GetErrorCode(parser)));
58 }
59 
60 static void cb_start (void *user, const char *el, const char **attr)
61 {
62  struct user_info *ui = (struct user_info*) user;
63  if (ui->level == 1)
64  data1_set_root (ui->dh, ui->d1_stack[0], ui->nmem, el);
65  ui->d1_stack[ui->level] = data1_mk_tag (ui->dh, ui->nmem, el, attr,
66  ui->d1_stack[ui->level-1]);
67  ui->level++;
68  yaz_log (ui->loglevel, "cb_start %s", el);
69 }
70 
71 static void cb_end (void *user, const char *el)
72 {
73  struct user_info *ui = (struct user_info*) user;
74 
75  ui->level--;
76  yaz_log (ui->loglevel, "cb_end %s", el);
77 }
78 
79 static void cb_chardata (void *user, const char *s, int len)
80 {
81  struct user_info *ui = (struct user_info*) user;
82 #if 0
83  yaz_log (ui->loglevel, "cb_chardata %.*s", len, s);
84 #endif
85  ui->d1_stack[ui->level] = data1_mk_text_n (ui->dh, ui->nmem, s, len,
86  ui->d1_stack[ui->level -1]);
87 }
88 
89 static void cb_decl (void *user, const char *version, const char *encoding,
90  int standalone)
91 {
92  struct user_info *ui = (struct user_info*) user;
93  const char *attr_list[7];
94 
95  attr_list[0] = "version";
96  attr_list[1] = version;
97 
98  attr_list[2] = "encoding";
99  attr_list[3] = "UTF-8"; /* internally it's always UTF-8 */
100 
101  attr_list[4] = "standalone";
102  attr_list[5] = standalone ? "yes" : "no";
103 
104  attr_list[6] = 0;
105 
106  data1_mk_preprocess (ui->dh, ui->nmem, "xml", attr_list,
107  ui->d1_stack[ui->level-1]);
108 #if 0
109  yaz_log (YLOG_LOG, "decl version=%s encoding=%s",
110  version ? version : "null",
111  encoding ? encoding : "null");
112 #endif
113 }
114 
115 static void cb_processing (void *user, const char *target,
116  const char *data)
117 {
118  struct user_info *ui = (struct user_info*) user;
119  data1_node *res =
120  data1_mk_preprocess (ui->dh, ui->nmem, target, 0,
121  ui->d1_stack[ui->level-1]);
122  data1_mk_text_nf (ui->dh, ui->nmem, data, strlen(data), res);
123 
124  yaz_log (ui->loglevel, "decl processing target=%s data=%s",
125  target ? target : "null",
126  data ? data : "null");
127 }
128 
129 static void cb_comment (void *user, const char *data)
130 {
131  struct user_info *ui = (struct user_info*) user;
132  yaz_log (ui->loglevel, "decl comment data=%s", data ? data : "null");
133  data1_mk_comment (ui->dh, ui->nmem, data, ui->d1_stack[ui->level-1]);
134 }
135 
136 static void cb_doctype_start (void *userData, const char *doctypeName,
137  const char *sysid, const char *pubid,
138  int has_internal_subset)
139 {
140  struct user_info *ui = (struct user_info*) userData;
141  yaz_log (ui->loglevel, "doctype start doctype=%s sysid=%s pubid=%s",
142  doctypeName, sysid, pubid);
143 }
144 
145 static void cb_doctype_end (void *userData)
146 {
147  struct user_info *ui = (struct user_info*) userData;
148  yaz_log (ui->loglevel, "doctype end");
149 }
150 
151 
152 static void cb_entity_decl (void *userData, const char *entityName,
153  int is_parameter_entity,
154  const char *value, int value_length,
155  const char *base, const char *systemId,
156  const char *publicId, const char *notationName)
157 {
158  struct user_info *ui = (struct user_info*) userData;
159  yaz_log (ui->loglevel,
160  "entity decl %s is_para_entry=%d value=%.*s base=%s systemId=%s"
161  " publicId=%s notationName=%s",
162  entityName, is_parameter_entity, value_length, value,
163  base, systemId, publicId, notationName);
164 
165 }
166 
167 static int cb_external_entity(XML_Parser pparser,
168  const char *context,
169  const char *base,
170  const char *systemId,
171  const char *publicId)
172 {
173  struct user_info *ui = (struct user_info*) XML_GetUserData(pparser);
174  FILE *inf;
175  int done = 0;
176  XML_Parser parser;
177 
178  yaz_log (ui->loglevel,
179  "external entity context=%s base=%s systemid=%s publicid=%s",
180  context, base, systemId, publicId);
181  if (!systemId)
182  return 1;
183 
184  if (!(inf = fopen (systemId, "rb")))
185  {
186  yaz_log (YLOG_WARN|YLOG_ERRNO, "fopen %s", systemId);
187  return 0;
188  }
189 
190  parser = XML_ExternalEntityParserCreate (pparser, "", 0);
191  while (!done)
192  {
193  int r;
194  void *buf = XML_GetBuffer (parser, XML_CHUNK);
195  if (!buf)
196  {
197  yaz_log (YLOG_WARN, "XML_GetBuffer fail");
198  break;
199  }
200  r = fread (buf, 1, XML_CHUNK, inf);
201  if (r == 0)
202  {
203  if (ferror(inf))
204  {
205  yaz_log (YLOG_WARN|YLOG_ERRNO, "fread %s", systemId);
206  break;
207  }
208  done = 1;
209  }
210  if (!XML_ParseBuffer (parser, r, done))
211  {
212  done = 1;
213  report_xml_error(parser);
214  }
215  }
216  fclose (inf);
217  XML_ParserFree (parser);
218  return done;
219 }
220 
221 
222 #if HAVE_ICONV_H
223 static int cb_encoding_convert (void *data, const char *s)
224 {
225  iconv_t t = (iconv_t) data;
226  size_t ret;
227  size_t outleft = 2;
228  char outbuf_[2], *outbuf = outbuf_;
229  size_t inleft = 4;
230  char *inbuf = (char *) s;
231  unsigned short code;
232 
233 #if 1
234  yaz_log(YLOG_LOG, "------------------------- cb_encoding_convert --- ");
235 #endif
236  ret = iconv (t, &inbuf, &inleft, &outbuf, &outleft);
237  if (ret == (size_t) (-1) && errno != E2BIG)
238  {
239  iconv (t, 0, 0, 0, 0);
240  return -1;
241  }
242  if (outleft != 0)
243  return -1;
244  memcpy (&code, outbuf_, sizeof(short));
245  return code;
246 }
247 
/* Release callback installed in XML_Encoding: closes the iconv descriptor
   that was stored as the encoding's data pointer. */
static void cb_encoding_release(void *data)
{
    iconv_close((iconv_t) data);
}
253 
254 static int cb_encoding_handler (void *userData, const char *name,
255  XML_Encoding *info)
256 {
257  int i = 0;
258  int no_ok = 0;
259  struct user_info *ui = (struct user_info*) userData;
260 
261  iconv_t t = iconv_open ("UNICODE", name);
262  if (t == (iconv_t) (-1))
263  return 0;
264 
265  info->data = 0; /* signal that multibyte is not in use */
266  yaz_log (ui->loglevel, "Encoding handler of %s", name);
267  for (i = 0; i<256; i++)
268  {
269  size_t ret;
270  char outbuf_[5];
271  char inbuf_[5];
272  char *inbuf = inbuf_;
273  char *outbuf = outbuf_;
274  size_t inleft = 1;
275  size_t outleft = 2;
276  inbuf_[0] = i;
277 
278  iconv (t, 0, 0, 0, 0); /* reset iconv */
279 
280  ret = iconv(t, &inbuf, &inleft, &outbuf, &outleft);
281  if (ret == (size_t) (-1))
282  {
283  if (errno == EILSEQ)
284  {
285  yaz_log (ui->loglevel, "Encoding %d: invalid sequence", i);
286  info->map[i] = -1; /* invalid sequence */
287  }
288  if (errno == EINVAL)
289  { /* multi byte input */
290  int len = 2;
291  int j = 0;
292  info->map[i] = -1;
293 
294  while (len <= 4)
295  {
296  char sbuf[80];
297  int k;
298  inbuf = inbuf_;
299  inleft = len;
300  outbuf = outbuf_;
301  outleft = 2;
302 
303  inbuf_[len-1] = j;
304  iconv (t, 0,0,0,0);
305 
306  assert (i >= 0 && i<255);
307 
308  *sbuf = 0;
309  for (k = 0; k<len; k++)
310  {
311  sprintf (sbuf+strlen(sbuf), "%d ", inbuf_[k]&255);
312  }
313  ret = iconv (t, &inbuf, &inleft, &outbuf, &outleft);
314  if (ret == (size_t) (-1))
315  {
316  if (errno == EILSEQ || errno == E2BIG)
317  {
318  j++;
319  if (j > 255)
320  break;
321  }
322  else if (errno == EINVAL)
323  {
324  len++;
325  j = 7;
326  }
327  }
328  else if (outleft == 0)
329  {
330  info->map[i] = -len;
331  info->data = t; /* signal that multibyte is in use */
332  break;
333  }
334  else
335  {
336  break;
337  }
338  }
339  if (info->map[i] < -1)
340  yaz_log (ui->loglevel, "Encoding %d: multibyte input %d",
341  i, -info->map[i]);
342  else
343  yaz_log (ui->loglevel, "Encoding %d: multibyte input failed",
344  i);
345  }
346  if (errno == E2BIG)
347  {
348  info->map[i] = -1; /* no room for output */
349  if (i != 0)
350  yaz_log (YLOG_WARN, "Encoding %d: no room for output",
351  i);
352  }
353  }
354  else if (outleft == 0)
355  {
356  unsigned short code;
357  memcpy (&code, outbuf_, sizeof(short));
358  info->map[i] = code;
359  no_ok++;
360  }
361  else
362  { /* should never happen */
363  info->map[i] = -1;
364  yaz_log (YLOG_DEBUG, "Encoding %d: bad state", i);
365  }
366  }
367  if (info->data)
368  { /* at least one multi byte */
369  info->convert = cb_encoding_convert;
370  info->release = cb_encoding_release;
371  }
372  else
373  {
374  /* no multi byte - we no longer need iconv handler */
375  iconv_close(t);
376  info->convert = 0;
377  info->release = 0;
378  }
379  if (!no_ok)
380  return 0;
381  return 1;
382 }
383 /* HAVE_ICONV_H */
384 #endif
385 
386 static void cb_ns_start(void *userData, const char *prefix, const char *uri)
387 {
388  struct user_info *ui = (struct user_info*) userData;
389  if (prefix && uri)
390  yaz_log(ui->loglevel, "cb_ns_start %s %s", prefix, uri);
391 }
392 
393 static void cb_ns_end(void *userData, const char *prefix)
394 {
395  struct user_info *ui = (struct user_info*) userData;
396  if (prefix)
397  yaz_log(ui->loglevel, "cb_ns_end %s", prefix);
398 }
399 
400 data1_node *zebra_read_xml(data1_handle dh,
401  struct ZebraRecStream *stream,
402  NMEM m)
403 {
404  XML_Parser parser;
405  struct user_info uinfo;
406  int done = 0;
407  data1_node *first_node;
408  int no_read = 0;
409 
410  uinfo.loglevel = YLOG_DEBUG;
411  uinfo.level = 1;
412  uinfo.dh = dh;
413  uinfo.nmem = m;
414  uinfo.d1_stack[0] = data1_mk_node2 (dh, m, DATA1N_root, 0);
415  uinfo.d1_stack[1] = 0; /* indicate no children (see end of routine) */
416 
417  parser = XML_ParserCreate (0 /* encoding */);
418 
419  XML_SetElementHandler (parser, cb_start, cb_end);
420  XML_SetCharacterDataHandler (parser, cb_chardata);
421  XML_SetXmlDeclHandler (parser, cb_decl);
422  XML_SetProcessingInstructionHandler (parser, cb_processing);
423  XML_SetUserData (parser, &uinfo);
424  XML_SetCommentHandler (parser, cb_comment);
425  XML_SetDoctypeDeclHandler (parser, cb_doctype_start, cb_doctype_end);
426  XML_SetEntityDeclHandler (parser, cb_entity_decl);
427  XML_SetExternalEntityRefHandler (parser, cb_external_entity);
428  XML_SetNamespaceDeclHandler(parser, cb_ns_start, cb_ns_end);
429 #if HAVE_ICONV_H
430  XML_SetUnknownEncodingHandler (parser, cb_encoding_handler, &uinfo);
431 #endif
432  while (!done)
433  {
434  int r;
435  void *buf = XML_GetBuffer (parser, XML_CHUNK);
436  if (!buf)
437  {
438  /* error */
439  yaz_log (YLOG_WARN, "XML_GetBuffer fail");
440  break;
441  }
442  r = stream->readf(stream, buf, XML_CHUNK);
443  if (r < 0)
444  {
445  /* error */
446  yaz_log (YLOG_WARN, "XML read fail");
447  break;
448  }
449  else if (r == 0)
450  done = 1;
451  else
452  no_read += r;
453  if (no_read && !XML_ParseBuffer (parser, r, done))
454  {
455  done = 1;
456  report_xml_error(parser);
457  }
458  }
459  XML_ParserFree (parser);
460  if (no_read == 0)
461  return 0;
462  if (!uinfo.d1_stack[1] || !done)
463  return 0;
464  /* insert XML header if not present .. */
465  first_node = uinfo.d1_stack[0]->child;
466  if (first_node->which != DATA1N_preprocess ||
467  strcmp(first_node->u.preprocess.target, "xml"))
468  {
469  const char *attr_list[5];
470 
471  attr_list[0] = "version";
472  attr_list[1] = "1.0";
473 
474  attr_list[2] = "encoding";
475  attr_list[3] = "UTF-8"; /* encoding */
476 
477  attr_list[4] = 0;
478 
479  data1_insert_preprocess (uinfo.dh, uinfo.nmem, "xml", attr_list,
480  uinfo.d1_stack[0]);
481  }
482  return uinfo.d1_stack[0];
483 }
484 
485 struct xml_info {
486  XML_Expat_Version expat_version;
487 };
488 
489 static data1_node *grs_read_xml(struct grs_read_info *p)
490 {
491  return zebra_read_xml(p->dh, p->stream, p->mem);
492 }
493 
494 static void *filter_init(Res res, RecType recType)
495 {
496  struct xml_info *p = (struct xml_info *) xmalloc (sizeof(*p));
497 
498  p->expat_version = XML_ExpatVersionInfo();
499 
500  return p;
501 }
502 
/* Filter destructor: release the private data allocated by filter_init. */
static void filter_destroy(void *clientData)
{
    struct xml_info *info = clientData;

    xfree(info);
}
509 
510 static int filter_extract(void *clientData, struct recExtractCtrl *ctrl)
511 {
512  return zebra_grs_extract(clientData, ctrl, grs_read_xml);
513 }
514 
515 static int filter_retrieve(void *clientData, struct recRetrieveCtrl *ctrl)
516 {
517  return zebra_grs_retrieve(clientData, ctrl, grs_read_xml);
518 }
519 
/* Record-type descriptor for the "grs.xml" filter; it is the single entry
   referenced by the filter table that follows. */
/* NOTE(review): the embedded listing numbers jump from 524 to 528, so some
   initializer entries appear to be missing from this view (presumably the
   destroy/extract/retrieve callbacks defined above) - confirm against the
   original file before editing. */
520 static struct recType filter_type = {
521  0,
522  "grs.xml",
523  filter_init,
524  0,
528 };
529 
/* Zero-terminated table of the record types provided by this module.  The
   table is named idzebra_filter_grs_xml when IDZEBRA_STATIC_GRS_XML is
   set. */
/* NOTE(review): listing number 534 (the name used in the #else branch) is
   missing from this view - confirm against the original file. */
530 RecType
531 #if IDZEBRA_STATIC_GRS_XML
532 idzebra_filter_grs_xml
533 #else
535 #endif
536 
537 [] = {
538  &filter_type,
539  0,
540 };
541 
542 #endif
543 
544 /*
545  * Local variables:
546  * c-basic-offset: 4
547  * c-file-style: "Stroustrup"
548  * indent-tabs-mode: nil
549  * End:
550  * vim: shiftwidth=4 tabstop=8 expandtab
551  */
552 
static void * filter_init(Res res, RecType recType)
Definition: mod_alvis.c:124
static struct recType filter_type
Definition: mod_alvis.c:704
RecType idzebra_filter[]
Definition: mod_alvis.c:721
#define DATA1N_root
Definition: data1.h:274
static FILE * inf
Definition: readfile.c:37
data1_node * data1_mk_preprocess(data1_handle dh, NMEM nmem, const char *target, const char **attr, data1_node *at)
Definition: d1_read.c:218
int zebra_grs_extract(void *clientData, struct recExtractCtrl *p, data1_node *(*grs_read)(struct grs_read_info *))
Definition: recgrs.c:935
data1_node * data1_mk_text_nf(data1_handle dh, NMEM mem, const char *buf, size_t len, data1_node *parent)
Definition: d1_read.c:341
data1_node * data1_mk_tag(data1_handle dh, NMEM nmem, const char *tag, const char **attr, data1_node *at)
Definition: d1_read.c:294
Definition: res.c:46
data1_node * data1_mk_node2(data1_handle dh, NMEM m, int type, data1_node *parent)
Definition: d1_read.c:145
struct data1_node::@2::@7 preprocess
#define DATA1N_preprocess
Definition: data1.h:284
data1_node * data1_mk_comment(data1_handle dh, NMEM mem, const char *buf, data1_node *parent)
Definition: d1_read.c:367
union data1_node::@2 u
struct data1_node * child
Definition: data1.h:341
int(* readf)(struct ZebraRecStream *s, char *buf, size_t count)
read function
Definition: recctrl.h:75
static void filter_destroy(void *clientData)
Definition: mod_alvis.c:332
int which
Definition: data1.h:285
void data1_set_root(data1_handle dh, data1_node *res, NMEM nmem, const char *name)
Definition: d1_read.c:190
data1_node * data1_insert_preprocess(data1_handle dh, NMEM nmem, const char *target, const char **attr, data1_node *at)
Definition: d1_read.c:238
long zint
Zebra integer.
Definition: util.h:66
data1_node * data1_mk_text_n(data1_handle dh, NMEM mem, const char *buf, size_t len, data1_node *parent)
Definition: d1_read.c:330
static int filter_retrieve(void *clientData, struct recRetrieveCtrl *p)
Definition: mod_alvis.c:584
static int filter_extract(void *clientData, struct recExtractCtrl *p)
Definition: mod_alvis.c:562
data1_handle dh
Definition: recgrs.h:31
record extract for indexing
Definition: recctrl.h:101
NMEM mem
Definition: recgrs.h:30
int zebra_grs_retrieve(void *clientData, struct recRetrieveCtrl *p, data1_node *(*grs_read)(struct grs_read_info *))
Definition: recgrs.c:1071
struct ZebraRecStream * stream
Definition: recgrs.h:28
record reader stream
Definition: recctrl.h:71
#define ZINT_FORMAT
Definition: util.h:72