IDZEBRA  2.2.7
d1_read.c
Go to the documentation of this file.
1 /* This file is part of the Zebra server.
2  Copyright (C) Index Data
3 
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8 
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 for more details.
13 
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 
18 */
19 
20 
21 /*
22  * This module reads "loose" SGML and converts it to data1 tree
23  */
24 
25 #if HAVE_CONFIG_H
26 #include <config.h>
27 #endif
28 #include <assert.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 
32 #include <yaz/snprintf.h>
33 #include <yaz/yaz-util.h>
34 #include <d1_absyn.h>
35 
37 {
38  if (!n)
39  return 0;
40  if (data1_is_xmlmode(dh))
41  {
42  n = n->child;
43  while (n && n->which != DATA1N_tag)
44  n = n->next;
45  }
46  return n;
47 }
48 
49 /*
50  * get the tag which is the immediate parent of this node (this may mean
51  * traversing intermediate things like variants and stuff).
52  */
54 {
55  if (data1_is_xmlmode(dh))
56  {
57  for (; n && n->which != DATA1N_root; n = n->parent)
58  if (n->which == DATA1N_tag && n->parent &&
59  n->parent->which != DATA1N_root)
60  return n;
61  }
62  else
63  {
64  for (; n && n->which != DATA1N_root; n = n->parent)
65  if (n->which == DATA1N_tag)
66  return n;
67  }
68  return 0;
69 }
70 
72 {
73  return data1_mk_node2(dh, m, DATA1N_root, 0);
74 }
75 
77 {
78  return data1_mk_node2(dh, m, type, 0);
79 }
80 
81 static void data1_init_node(data1_handle dh, data1_node *r, int type)
82 {
83  r->which = type;
84  switch(type)
85  {
86  case DATA1N_tag:
87  r->u.tag.tag = 0;
88  r->u.tag.element = 0;
89  r->u.tag.no_data_requested = 0;
90  r->u.tag.node_selected = 0;
91  r->u.tag.make_variantlist = 0;
92  r->u.tag.get_bytes = -1;
93  r->u.tag.attributes = 0;
94  break;
95  case DATA1N_root:
96  r->u.root.type = 0;
97  r->u.root.absyn = 0;
98  break;
99  case DATA1N_data:
100  r->u.data.data = 0;
101  r->u.data.len = 0;
102  r->u.data.what = 0;
103  r->u.data.formatted_text = 0;
104  break;
105  case DATA1N_comment:
106  r->u.data.data = 0;
107  r->u.data.len = 0;
108  r->u.data.what = 0;
109  r->u.data.formatted_text = 1;
110  break;
111  case DATA1N_variant:
112  r->u.variant.type = 0;
113  r->u.variant.value = 0;
114  break;
115  case DATA1N_preprocess:
116  r->u.preprocess.target = 0;
117  r->u.preprocess.attributes = 0;
118  break;
119  default:
120  yaz_log(YLOG_WARN, "data_mk_node_type. bad type = %d\n", type);
121  }
122 }
123 
125  data1_node *parent)
126 {
127  data1_node *r = (data1_node *)nmem_malloc(m, sizeof(*r));
128  r->next = r->child = r->last_child = 0;
129 
130  r->parent = parent;
131  if (!parent)
132  r->root = r;
133  else
134  {
135  r->root = parent->root;
136  if (!parent->child)
137  parent->child = parent->last_child = r;
138  else
139  parent->last_child->next = r;
140  parent->last_child = r;
141  }
142  data1_init_node(dh, r, type);
143  return r;
144 }
145 
146 data1_node *data1_mk_node2(data1_handle dh, NMEM m, int type,
147  data1_node *parent)
148 {
149  return data1_append_node(dh, m, type, parent);
150 }
151 
153  data1_node *parent)
154 {
155  data1_node *r = (data1_node *)nmem_malloc(m, sizeof(*r));
156  r->next = r->child = r->last_child = 0;
157 
158  if (!parent)
159  r->root = r;
160  else
161  {
162  r->root = parent->root;
163  r->parent = parent;
164  if (!parent->child)
165  parent->last_child = r;
166  else
167  r->next = parent->child;
168  parent->child = r;
169  }
170  data1_init_node(dh, r, type);
171  return r;
172 }
173 
174 data1_node *data1_mk_root(data1_handle dh, NMEM nmem, const char *name)
175 {
176  data1_absyn *absyn = data1_get_absyn(dh, name, 1);
177  data1_node *res;
178 
179  if (!absyn)
180  {
181  yaz_log(YLOG_WARN, "Unable to acquire abstract syntax " "for '%s'",
182  name);
183  /* It's now OK for a record not to have an absyn */
184  }
185  res = data1_mk_node2(dh, nmem, DATA1N_root, 0);
186  res->u.root.type = data1_insert_string(dh, res, nmem, name);
187  res->u.root.absyn = absyn;
188  return res;
189 }
190 
192  NMEM nmem, const char *name)
193 {
194  data1_absyn *absyn = data1_get_absyn(
195  dh, name, DATA1_XPATH_INDEXING_ENABLE);
196 
197  res->u.root.type = data1_insert_string(dh, res, nmem, name);
198  res->u.root.absyn = absyn;
199 }
200 
201 void data1_add_attrs(data1_handle dh, NMEM nmem, const char **attr,
202  data1_xattr **p)
203 {
204  while (*p)
205  p = &(*p)->next;
206 
207  while (attr && *attr)
208  {
209  *p = (data1_xattr*) nmem_malloc(nmem, sizeof(**p));
210  (*p)->name = nmem_strdup(nmem, *attr++);
211  (*p)->value = nmem_strdup(nmem, *attr++);
212  (*p)->what = DATA1I_text;
213 
214  p = &(*p)->next;
215  }
216  *p = 0;
217 }
218 
220  const char *target,
221  const char **attr, data1_node *at)
222 {
223  return data1_mk_preprocess_n(dh, nmem, target, strlen(target),
224  attr, at);
225 }
226 
228  const char *target, size_t len,
229  const char **attr, data1_node *at)
230 {
231  data1_node *res = data1_mk_node2(dh, nmem, DATA1N_preprocess, at);
232  res->u.preprocess.target = data1_insert_string_n(dh, res, nmem,
233  target, len);
234 
235  data1_add_attrs(dh, nmem, attr, &res->u.preprocess.attributes);
236  return res;
237 }
238 
240  const char *target,
241  const char **attr, data1_node *at)
242 {
243  return data1_insert_preprocess_n(dh, nmem, target, strlen(target),
244  attr, at);
245 }
246 
248  const char *target, size_t len,
249  const char **attr, data1_node *at)
250 {
251  data1_node *res = data1_insert_node(dh, nmem, DATA1N_preprocess, at);
252  res->u.preprocess.target = data1_insert_string_n(dh, res, nmem,
253  target, len);
254 
255  data1_add_attrs(dh, nmem, attr, &res->u.preprocess.attributes);
256  return res;
257 }
258 
260  const char *tag, size_t len, const char **attr,
261  data1_node *at)
262 {
263  data1_node *partag = get_parent_tag(dh, at);
264  data1_node *res = data1_mk_node2(dh, nmem, DATA1N_tag, at);
265  data1_element *e = 0;
266 
267  res->u.tag.tag = data1_insert_string_n(dh, res, nmem, tag, len);
268 
269  if (!partag) /* top tag? */
270  e = data1_getelementbytagname(dh, at->root->u.root.absyn,
271  0 /* index as local */,
272  res->u.tag.tag);
273  else
274  {
275  /* only set element for known tags */
276  e = partag->u.tag.element;
277  if (e)
278  e = data1_getelementbytagname(dh, at->root->u.root.absyn,
279  e, res->u.tag.tag);
280  }
281  res->u.tag.element = e;
282  data1_add_attrs(dh, nmem, attr, &res->u.tag.attributes);
283  return res;
284 }
285 
286 void data1_tag_add_attr(data1_handle dh, NMEM nmem,
287  data1_node *res, const char **attr)
288 {
289  if (res->which != DATA1N_tag)
290  return;
291 
292  data1_add_attrs(dh, nmem, attr, &res->u.tag.attributes);
293 }
294 
296  const char *tag, const char **attr, data1_node *at)
297 {
298  return data1_mk_tag_n(dh, nmem, tag, strlen(tag), attr, at);
299 }
300 
302  const char *tag)
303 {
304  if (*tag == '/')
305  {
306  n = data1_get_root_tag(dh, n);
307  if (n)
308  n = n->child;
309  tag++;
310  }
311  for (; n; n = n->next)
312  if (n->which == DATA1N_tag && n->u.tag.tag &&
313  !yaz_matchstr(n->u.tag.tag, tag))
314  {
315  return n;
316  }
317  return 0;
318 }
319 
321  const char *tag, data1_node *at)
322 {
323  data1_node *node = data1_search_tag(dh, at->child, tag);
324  if (!node)
325  node = data1_mk_tag(dh, nmem, tag, 0 /* attr */, at);
326  else
327  node->child = node->last_child = 0;
328  return node;
329 }
330 
332  const char *buf, size_t len, data1_node *parent)
333 {
334  data1_node *res = data1_mk_node2(dh, mem, DATA1N_data, parent);
335  data1_set_data_string_n(dh, res, mem, buf, len);
336  return res;
337 }
338 
340  const char *buf, size_t len, data1_node *parent)
341 {
342  data1_node *res = data1_mk_text_n(dh, mem, buf, len, parent);
343  res->u.data.formatted_text = 1;
344  return res;
345 }
346 
348  const char *buf, data1_node *parent)
349 {
350  return data1_mk_text_n(dh, mem, buf, strlen(buf), parent);
351 }
352 
354  const char *buf, size_t len,
355  data1_node *parent)
356 {
357  data1_node *res = data1_mk_node2(dh, mem, DATA1N_comment, parent);
358  data1_set_data_string_n(dh, res, mem, buf, len);
359  return res;
360 }
361 
363  const char *buf, data1_node *parent)
364 {
365  return data1_mk_comment_n(dh, mem, buf, strlen(buf), parent);
366 }
367 
369  const char *str, size_t len)
370 {
371  res->u.data.what = DATA1I_text;
372  res->u.data.data = data1_insert_string_n(dh, res, m, str, len);
373  res->u.data.len = len;
374 }
375 
377  const char *str)
378 {
379  data1_set_data_string_n(dh, res, m, str, strlen(str));
380 }
381 
383  NMEM m, const char *str, size_t len)
384 {
385  char *b;
386  if (len >= DATA1_LOCALDATA)
387  b = (char *) nmem_malloc(m, len+1);
388  else
389  b = res->lbuf;
390  memcpy(b, str, len);
391  b[len] = 0;
392  return b;
393 }
394 
395 char *data1_insert_zint(data1_handle dh, data1_node *res, NMEM m, zint num)
396 {
397  char str[64];
398 
399  yaz_snprintf(str, sizeof(str), ZINT_FORMAT, num);
400  return data1_insert_string(dh, res, m, str);
401 }
402 
403 void data1_set_data_zint(data1_handle dh, data1_node *res, NMEM m, zint num)
404 {
405  res->u.data.what = DATA1I_num;
406  res->u.data.data = data1_insert_zint(dh, res, m, num);
407  res->u.data.len = strlen(res->u.data.data);
408 }
409 
411  NMEM m, const char *str)
412 {
413  return data1_insert_string_n(dh, res, m, str, strlen(str));
414 }
415 
417  data1_node *at,
418  const char *tagname, NMEM m,
419  int local_allowed,
420  int insert_mode)
421 {
422  data1_node *root = at->root;
423  data1_node *partag = get_parent_tag(dh, at);
424  data1_element *e = NULL;
425  data1_node *datn = 0;
426  data1_node *tagn = 0;
427 
428  if (!partag)
429  e = data1_getelementbytagname(dh, root->u.root.absyn, 0, tagname);
430  else
431  {
432  e = partag->u.tag.element;
433  if (e)
434  e = data1_getelementbytagname(dh, root->u.root.absyn, e, tagname);
435  }
436  if (local_allowed || e)
437  {
438  if (insert_mode)
439  tagn = data1_insert_node(dh, m, DATA1N_tag, at);
440  else
441  tagn = data1_append_node(dh, m, DATA1N_tag, at);
442  tagn->u.tag.tag = data1_insert_string(dh, tagn, m, tagname);
443  tagn->u.tag.element = e;
444  datn = data1_mk_node2(dh, m, DATA1N_data, tagn);
445  }
446  return datn;
447 }
448 
450  const char *tagname, NMEM m)
451 {
452  return data1_add_insert_taggeddata(dh, at, tagname, m, 1, 0);
453 }
454 
455 
456 /*
457  * Insert a tagged node into the record root as first child of the node at
458  * (which should be root or tag itself). Returns pointer to the data node,
459  * which can then be modified.
460  */
462  const char *tagname, NMEM m)
463 {
464  return data1_add_insert_taggeddata(dh, at, tagname, m, 0, 1);
465 }
466 
468  data1_node *at, const char *tagname,
469  NMEM m)
470 {
471  return data1_add_insert_taggeddata(dh, at, tagname, m, 0, 1);
472 }
473 
475  data1_node *at, const char *tagname,
476  NMEM m)
477 {
478  return data1_add_insert_taggeddata(dh, at, tagname, m, 1, 0);
479 }
480 
482  const char *tag, zint num,
483  NMEM nmem)
484 {
485  data1_node *node_data;
486 
487  node_data = data1_mk_tag_data(dh, at, tag, nmem);
488  if (!node_data)
489  return 0;
490  data1_set_data_zint(dh, node_data, nmem, num);
491  return node_data;
492 }
493 
495  const char *tag, int num,
496  NMEM nmem)
497 {
498  return data1_mk_tag_data_zint(dh, at, tag, num, nmem);
499 }
500 
502  const char *tag, Odr_oid *oid,
503  NMEM nmem)
504 {
505  data1_node *node_data;
506  char str[128], *p = str;
507  size_t i;
508 
509  node_data = data1_mk_tag_data(dh, at, tag, nmem);
510  if (!node_data)
511  return 0;
512 
513  for (i = 0; i < 14 && oid[i] >= 0; i++)
514  {
515  if (i > 0)
516  *p++ = '.';
517  yaz_snprintf(p, 7, "%d", oid[i]);
518  p += strlen(p);
519  }
520  data1_set_data_string(dh, node_data, nmem, str);
521  node_data->u.data.what = DATA1I_oid;
522  return node_data;
523 }
524 
525 
527  const char *tag, const char *str,
528  NMEM nmem)
529 {
530  data1_node *node_data = data1_mk_tag_data(dh, at, tag, nmem);
531  if (!node_data)
532  return 0;
533  data1_set_data_string(dh, node_data, nmem, str);
534  return node_data;
535 }
536 
537 
539  const char *tag, const char *str,
540  NMEM nmem)
541 {
542  data1_node *node = data1_search_tag(dh, at->child, tag);
543  if (!node)
544  return data1_mk_tag_data_text(dh, at, tag, str, nmem);
545  node = node->child;
546  data1_set_data_string(dh, node, nmem, str);
547  node->child = node->last_child = 0;
548  return node;
549 }
550 
/*
 * Fetch the next byte from the input callback. *amp is always cleared,
 * so callers testing it never see an "escaped" byte from this reader.
 */
static int ampr(int (*get_byte)(void *fh), void *fh, int *amp)
{
    int byte = get_byte(fh);

    *amp = 0;
    return byte;
}
557 
559  int (*get_byte)(void *fh), void *fh,
560  WRBUF wrbuf, int *ch, int *amp)
561 {
562  data1_xattr *p_first = 0;
563  data1_xattr **pp = &p_first;
564  int c = *ch;
565  for (;;)
566  {
567  data1_xattr *p;
568  while (*amp || (c && d1_isspace(c)))
569  c = ampr(get_byte, fh, amp);
570  if (*amp == 0 && (c == 0 || c == '>' || c == '/'))
571  break;
572  *pp = p = (data1_xattr *) nmem_malloc(m, sizeof(*p));
573  p->next = 0;
574  pp = &p->next;
575  p->value = 0;
576  p->what = DATA1I_xmltext;
577 
578  wrbuf_rewind(wrbuf);
579  while (c && c != '=' && c != '>' && c != '/' && !d1_isspace(c))
580  {
581  wrbuf_putc(wrbuf, c);
582  c = ampr(get_byte, fh, amp);
583  }
584  p->name = nmem_strdup(m, wrbuf_cstr(wrbuf));
585  if (c == '=')
586  {
587  c = ampr(get_byte, fh, amp);
588  if (*amp == 0 && c == '"')
589  {
590  c = ampr(get_byte, fh, amp);
591  wrbuf_rewind(wrbuf);
592  while (*amp || (c && c != '"'))
593  {
594  wrbuf_putc(wrbuf, c);
595  c = ampr(get_byte, fh, amp);
596  }
597  if (c)
598  c = ampr(get_byte, fh, amp);
599  }
600  else if (*amp == 0 && c == '\'')
601  {
602  c = ampr(get_byte, fh, amp);
603  wrbuf_rewind(wrbuf);
604  while (*amp || (c && c != '\''))
605  {
606  wrbuf_putc(wrbuf, c);
607  c = ampr(get_byte, fh, amp);
608  }
609  if (c)
610  c = ampr(get_byte, fh, amp);
611  }
612  else
613  {
614  wrbuf_rewind(wrbuf);
615  while (*amp || (c && c != '>' && c != '/'))
616  {
617  wrbuf_putc(wrbuf, c);
618  c = ampr(get_byte, fh, amp);
619  }
620  }
621  p->value = nmem_strdup(m, wrbuf_cstr(wrbuf));
622  }
623  }
624  *ch = c;
625  return p_first;
626 }
627 
628 /*
629  * Ugh. Sometimes functions just grow and grow on you. This one reads a
630  * 'node' and its children.
631  */
633  int (*get_byte)(void *fh), void *fh, WRBUF wrbuf)
634 {
635  data1_node *d1_stack[256];
636  data1_node *res;
637  int c, amp;
638  int level = 0;
639  int line = 1;
640 
641  d1_stack[level] = 0;
642  c = ampr(get_byte, fh, &amp);
643  while (c != '\0')
644  {
645  data1_node *parent = level ? d1_stack[level-1] : 0;
646 
647  if (amp == 0 && c == '<') /* beginning of tag */
648  {
649  data1_xattr *xattr;
650 
651  char tag[256];
652  int null_tag = 0;
653  int end_tag = 0;
654  size_t i = 0;
655 
656  c = ampr(get_byte, fh, &amp);
657  if (amp == 0 && c == '/')
658  {
659  end_tag = 1;
660  c = ampr(get_byte, fh, &amp);
661  }
662  else if (amp == 0 && c == '?')
663  {
664  int quote_mode = 0;
665  while ((c = ampr(get_byte, fh, &amp)))
666  {
667  if (amp)
668  continue;
669  if (quote_mode == 0)
670  {
671  if (c == '"')
672  quote_mode = c;
673  else if (c == '\'')
674  quote_mode = c;
675  else if (c == '>')
676  {
677  c = ampr(get_byte, fh, &amp);
678  break;
679  }
680  }
681  else
682  {
683  if (amp == 0 && c == quote_mode)
684  quote_mode = 0;
685  }
686  }
687  continue;
688  }
689  else if (amp == 0 && c == '!')
690  {
691  int c0, amp0;
692 
693  wrbuf_rewind(wrbuf);
694 
695  c0 = ampr(get_byte, fh, &amp0);
696  if (amp0 == 0 && c0 == '\0')
697  break;
698  c = ampr(get_byte, fh, &amp);
699 
700  if (amp0 == 0 && c0 == '-' && amp == 0 && c == '-')
701  {
702  /* COMMENT: <!-- ... --> */
703  int no_dash = 0;
704 
705  c = ampr(get_byte, fh, &amp);
706  while (amp || c)
707  {
708  if (amp == 0 && c == '-')
709  no_dash++;
710  else if (amp == 0 && c == '>' && no_dash >= 2)
711  {
712  if (level > 0)
713  d1_stack[level] =
715  dh, m,
716  wrbuf_buf(wrbuf), wrbuf_len(wrbuf)-2,
717  d1_stack[level-1]);
718  c = ampr(get_byte, fh, &amp); /* skip > */
719  break;
720  }
721  else
722  no_dash = 0;
723  wrbuf_putc(wrbuf, c);
724  c = ampr(get_byte, fh, &amp);
725  }
726  continue;
727  }
728  else
729  { /* DIRECTIVE: <! .. > */
730 
731  int blevel = 0;
732  while (amp || c)
733  {
734  if (amp == 0 && c == '>' && blevel == 0)
735  {
736  c = ampr(get_byte, fh, &amp);
737  break;
738  }
739  if (amp == 0 && c == '[')
740  blevel++;
741  if (amp == 0 && c == ']' && blevel > 0)
742  blevel--;
743  c = ampr(get_byte, fh, &amp);
744  }
745  continue;
746  }
747  }
748  while (amp || (c && c != '>' && c != '/' && !d1_isspace(c)))
749  {
750  if (i < (sizeof(tag)-1))
751  tag[i++] = c;
752  c = ampr(get_byte, fh, &amp);
753  }
754  tag[i] = '\0';
755  xattr = data1_read_xattr(dh, m, get_byte, fh, wrbuf, &c, &amp);
756  if (amp == 0 && c == '/')
757  { /* <tag attrs/> or <tag/> */
758  null_tag = 1;
759  c = ampr(get_byte, fh, &amp);
760  }
761  if (amp || c != '>')
762  {
763  yaz_log(YLOG_WARN, "d1: %d: Malformed tag", line);
764  return 0;
765  }
766  else
767  c = ampr(get_byte, fh, &amp);
768 
769  /* End tag? */
770  if (end_tag)
771  {
772  if (*tag == '\0')
773  --level; /* </> */
774  else
775  { /* </tag> */
776  int i = level;
777  while (i > 0)
778  {
779  parent = d1_stack[--i];
780  if ((parent->which == DATA1N_root &&
781  !strcmp(tag, parent->u.root.type)) ||
782  (parent->which == DATA1N_tag &&
783  !strcmp(tag, parent->u.tag.tag)))
784  {
785  level = i;
786  break;
787  }
788  }
789  if (i != level)
790  {
791  yaz_log(YLOG_WARN, "%d: no begin tag for %s",
792  line, tag);
793  break;
794  }
795  }
796  if (data1_is_xmlmode(dh))
797  {
798  if (level <= 1)
799  return d1_stack[0];
800  }
801  else
802  {
803  if (level <= 0)
804  return d1_stack[0];
805  }
806  continue;
807  }
808  else if (!strcmp(tag, "var")
809  && xattr && xattr->next && xattr->next->next
810  && xattr->value == 0
811  && xattr->next->value == 0
812  && xattr->next->next->value == 0)
813  {
814  /* <var class type value> */
815  const char *tclass = xattr->name;
816  const char *type = xattr->next->name;
817  const char *value = xattr->next->name;
818  data1_vartype *tp;
819 
820  yaz_log(YLOG_LOG, "Variant class=%s type=%s value=%s",
821  tclass, type, value);
822  if (!(tp =
824  parent->root->u.root.absyn->varset,
825  tclass, type)))
826  continue;
827  /*
828  * If we're the first variant in this group, create a parent
829  * variant, and insert it before the current variant.
830  */
831  if (parent->which != DATA1N_variant)
832  {
833  res = data1_mk_node2(dh, m, DATA1N_variant, parent);
834  }
835  else
836  {
837  /*
838  * now determine if one of our ancestor triples is of
839  * same type. If so, we break here.
840  */
841  int i;
842  for (i = level-1; d1_stack[i]->which==DATA1N_variant; --i)
843  if (d1_stack[i]->u.variant.type == tp)
844  {
845  level = i;
846  break;
847  }
848  res = data1_mk_node2(dh, m, DATA1N_variant, parent);
849  res->u.variant.type = tp;
850  res->u.variant.value =
851  data1_insert_string(dh, res, m, value);
852  }
853  }
854  else
855  {
856 
857  /* tag .. acquire our element in the abstract syntax */
858  if (level == 0)
859  {
860  parent = data1_mk_root(dh, m, tag);
861  res = d1_stack[level] = parent;
862 
863  if (data1_is_xmlmode(dh))
864  {
865  level++;
866  res = data1_mk_tag(dh, m, tag, 0 /* attr */, parent);
867  res->u.tag.attributes = xattr;
868  }
869  }
870  else
871  {
872  res = data1_mk_tag(dh, m, tag, 0 /* attr */, parent);
873  res->u.tag.attributes = xattr;
874  }
875  }
876  d1_stack[level] = res;
877  d1_stack[level+1] = 0;
878  if (level < 250 && !null_tag)
879  ++level;
880  }
881  else /* != '<'... this is a body of text */
882  {
883  int len;
884 
885  if (level == 0)
886  {
887  c = ampr(get_byte, fh, &amp);
888  continue;
889  }
890  res = data1_mk_node2(dh, m, DATA1N_data, parent);
891  res->u.data.what = DATA1I_xmltext;
892  res->u.data.formatted_text = 0;
893  d1_stack[level] = res;
894 
895  wrbuf_rewind(wrbuf);
896 
897  while (amp || (c && c != '<'))
898  {
899  wrbuf_putc(wrbuf, c);
900  c = ampr(get_byte, fh, &amp);
901  }
902  len = wrbuf_len(wrbuf);
903 
904  /* use local buffer of nmem if too large */
905  if (len >= DATA1_LOCALDATA)
906  res->u.data.data = (char*) nmem_malloc(m, len);
907  else
908  res->u.data.data = res->lbuf;
909 
910  if (len)
911  memcpy(res->u.data.data, wrbuf_buf(wrbuf), len);
912  else
913  res->u.data.data = 0;
914  res->u.data.len = len;
915  }
916  }
917  return 0;
918 }
919 
/*
 * Byte source over a 0-terminated string. fh points to a cursor of type
 * 'const char *'. Each call returns the character under the cursor and
 * moves the cursor forward; at the terminator it returns 0 and leaves the
 * cursor where it is.
 */
int getc_mem(void *fh)
{
    const char **cursor = (const char **) fh;
    char ch = **cursor;

    if (ch == '\0')
        return 0;
    ++*cursor;
    return ch;
}
927 
928 data1_node *data1_read_node(data1_handle dh, const char **buf, NMEM m)
929 {
930  WRBUF wrbuf = wrbuf_alloc();
931  data1_node *node;
932 
933  node = data1_read_nodex(dh, m, getc_mem, (void *) (buf), wrbuf);
934  wrbuf_destroy(wrbuf);
935  return node;
936 }
937 
938 /*
939  * Read a record in the native syntax.
940  */
942  int (*rf)(void *, char *, size_t), void *fh,
943  NMEM m)
944 {
945  int *size;
946  char **buf = data1_get_read_buf(dh, &size);
947  const char *bp;
948  int rd = 0, res;
949 
950  if (!*buf)
951  *buf = (char *)xmalloc(*size = 4096);
952 
953  for (;;)
954  {
955  if (rd + 2048 >= *size && !(*buf =(char *)xrealloc(*buf, *size *= 2)))
956  abort();
957  if ((res = (*rf)(fh, *buf + rd, 2048)) <= 0)
958  {
959  if (!res)
960  {
961  bp = *buf;
962  (*buf)[rd] = '\0';
963  return data1_read_node(dh, &bp, m);
964  }
965  else
966  return 0;
967  }
968  rd += res;
969  }
970 }
971 
972 data1_node *data1_read_sgml(data1_handle dh, NMEM m, const char *buf)
973 {
974  const char *bp = buf;
975  return data1_read_node(dh, &bp, m);
976 }
977 
978 
979 static int conv_item(NMEM m, yaz_iconv_t t,
980  WRBUF wrbuf, char *inbuf, size_t inlen)
981 {
982  wrbuf_rewind(wrbuf);
983  wrbuf_iconv_write(wrbuf, t, inbuf, inlen);
984  wrbuf_iconv_reset(wrbuf, t);
985  return 0;
986 }
987 
988 static void data1_iconv_s(data1_handle dh, NMEM m, data1_node *n,
989  yaz_iconv_t t, WRBUF wrbuf, const char *tocode)
990 {
991  for (; n; n = n->next)
992  {
993  switch (n->which)
994  {
995  case DATA1N_data:
996  case DATA1N_comment:
997  if (conv_item(m, t, wrbuf, n->u.data.data, n->u.data.len) == 0)
998  {
999  n->u.data.data =
1000  data1_insert_string_n(dh, n, m, wrbuf->buf, wrbuf->pos);
1001  n->u.data.len = wrbuf->pos;
1002  }
1003  break;
1004  case DATA1N_tag:
1005  if (conv_item(m, t, wrbuf, n->u.tag.tag, strlen(n->u.tag.tag))
1006  == 0)
1007  {
1008  n->u.tag.tag =
1009  data1_insert_string_n(dh, n, m, wrbuf->buf, wrbuf->pos);
1010  }
1011  if (n->u.tag.attributes)
1012  {
1013  data1_xattr *p;
1014  for (p = n->u.tag.attributes; p; p = p->next)
1015  {
1016  if (p->value &&
1017  conv_item(m, t, wrbuf, p->value, strlen(p->value))
1018  == 0)
1019  {
1020  p->value = nmem_strdup(m, wrbuf_cstr(wrbuf));
1021  }
1022  }
1023  }
1024  break;
1025  case DATA1N_preprocess:
1026  if (strcmp(n->u.preprocess.target, "xml") == 0)
1027  {
1028  data1_xattr *p = n->u.preprocess.attributes;
1029  for (; p; p = p->next)
1030  if (strcmp(p->name, "encoding") == 0)
1031  p->value = nmem_strdup(m, tocode);
1032  }
1033  break;
1034  }
1035  data1_iconv_s(dh, m, n->child, t, wrbuf, tocode);
1036  }
1037 }
1038 
1040 {
1041  /* see if we have an xml header that specifies encoding */
1042  if (n && n->child && n->child->which == DATA1N_preprocess &&
1043  strcmp(n->child->u.preprocess.target, "xml") == 0)
1044  {
1045  data1_xattr *xp = n->child->u.preprocess.attributes;
1046  for (; xp; xp = xp->next)
1047  if (strcmp(xp->name, "encoding") == 0)
1048  return xp->value;
1049  }
1050  /* no encoding in header, so see if "encoding" was specified for abs */
1051  if (n && n->which == DATA1N_root &&
1052  n->u.root.absyn && n->u.root.absyn->encoding)
1053  return n->u.root.absyn->encoding;
1054  /* none of above, return a hard coded default */
1055  return "ISO-8859-1";
1056 }
1057 
1059  const char *tocode,
1060  const char *fromcode)
1061 {
1062  if (yaz_matchstr(tocode, fromcode))
1063  {
1064  WRBUF wrbuf = wrbuf_alloc();
1065  yaz_iconv_t t = yaz_iconv_open(tocode, fromcode);
1066  if (!t)
1067  {
1068  wrbuf_destroy(wrbuf);
1069  return -1;
1070  }
1071  data1_iconv_s(dh, m, n, t, wrbuf, tocode);
1072  yaz_iconv_close(t);
1073  wrbuf_destroy(wrbuf);
1074  }
1075  return 0;
1076 }
1077 
1079 {
1080  for (; n; n = n->next)
1081  {
1082  if (n->which == DATA1N_data)
1083  {
1084 
1085  int sz = n->u.data.len;
1086  const char *ndata = n->u.data.data;
1087  int off = 0;
1088 
1089  for (off = 0; off < sz; off++)
1090  if (!d1_isspace(ndata[off]))
1091  break;
1092  sz = sz - off;
1093  ndata += off;
1094 
1095  while (sz && d1_isspace(ndata[sz - 1]))
1096  sz--;
1097 
1098  n->u.data.data = nmem_malloc(m, sz);
1099  n->u.data.len = sz;
1100  memcpy(n->u.data.data, ndata, sz);
1101 
1102  }
1103  data1_chop_text(dh, m, n->child);
1104  }
1105 }
1106 
1108 {
1109  for (; n; n = n->next)
1110  {
1111  if (n->which == DATA1N_data && n->next &&
1112  n->next->which == DATA1N_data)
1113  {
1114  int sz = 0;
1115  int off = 0;
1116  char *ndata;
1117  data1_node *np;
1118  for (np = n; np && np->which == DATA1N_data; np=np->next)
1119  sz += np->u.data.len;
1120  ndata = nmem_malloc(m, sz);
1121  for (np = n; np && np->which == DATA1N_data; np=np->next)
1122  {
1123  memcpy(ndata+off, np->u.data.data, np->u.data.len);
1124  off += np->u.data.len;
1125  }
1126  n->u.data.data = ndata;
1127  n->u.data.len = sz;
1128  n->next = np;
1129  if (!np && n->parent)
1130  n->parent->last_child = n;
1131 
1132  }
1133  data1_concat_text(dh, m, n->child);
1134  }
1135 }
1136 
1137 /*
1138  * Local variables:
1139  * c-basic-offset: 4
1140  * c-file-style: "Stroustrup"
1141  * indent-tabs-mode: nil
1142  * End:
1143  * vim: shiftwidth=4 tabstop=8 expandtab
1144  */
1145 
data1_node * data1_mk_tag(data1_handle dh, NMEM nmem, const char *tag, const char **attr, data1_node *at)
Definition: d1_read.c:295
int data1_iconv(data1_handle dh, NMEM m, data1_node *n, const char *tocode, const char *fromcode)
Definition: d1_read.c:1058
data1_node * data1_search_tag(data1_handle dh, data1_node *n, const char *tag)
Definition: d1_read.c:301
data1_node * data1_mk_root(data1_handle dh, NMEM nmem, const char *name)
Definition: d1_read.c:174
void data1_add_attrs(data1_handle dh, NMEM nmem, const char **attr, data1_xattr **p)
Definition: d1_read.c:201
char * data1_insert_string(data1_handle dh, data1_node *res, NMEM m, const char *str)
Definition: d1_read.c:410
char * data1_insert_string_n(data1_handle dh, data1_node *res, NMEM m, const char *str, size_t len)
Definition: d1_read.c:382
data1_node * data1_mk_node_type(data1_handle dh, NMEM m, int type)
Definition: d1_read.c:76
data1_node * data1_insert_preprocess(data1_handle dh, NMEM nmem, const char *target, const char **attr, data1_node *at)
Definition: d1_read.c:239
static void data1_init_node(data1_handle dh, data1_node *r, int type)
Definition: d1_read.c:81
data1_node * data1_mk_tag_data_text(data1_handle dh, data1_node *at, const char *tag, const char *str, NMEM nmem)
Definition: d1_read.c:526
void data1_tag_add_attr(data1_handle dh, NMEM nmem, data1_node *res, const char **attr)
Definition: d1_read.c:286
data1_node * data1_mk_comment_n(data1_handle dh, NMEM mem, const char *buf, size_t len, data1_node *parent)
Definition: d1_read.c:353
void data1_concat_text(data1_handle dh, NMEM m, data1_node *n)
Definition: d1_read.c:1107
data1_node * data1_read_sgml(data1_handle dh, NMEM m, const char *buf)
Definition: d1_read.c:972
data1_xattr * data1_read_xattr(data1_handle dh, NMEM m, int(*get_byte)(void *fh), void *fh, WRBUF wrbuf, int *ch, int *amp)
Definition: d1_read.c:558
data1_node * data1_mk_tag_data_wd(data1_handle dh, data1_node *at, const char *tagname, NMEM m)
Definition: d1_read.c:461
data1_node * data1_insert_node(data1_handle dh, NMEM m, int type, data1_node *parent)
Definition: d1_read.c:152
data1_node * data1_read_nodex(data1_handle dh, NMEM m, int(*get_byte)(void *fh), void *fh, WRBUF wrbuf)
Definition: d1_read.c:632
char * data1_insert_zint(data1_handle dh, data1_node *res, NMEM m, zint num)
Definition: d1_read.c:395
data1_node * data1_mk_node(data1_handle dh, NMEM m)
Definition: d1_read.c:71
void data1_set_data_string(data1_handle dh, data1_node *res, NMEM m, const char *str)
Definition: d1_read.c:376
void data1_set_data_zint(data1_handle dh, data1_node *res, NMEM m, zint num)
Definition: d1_read.c:403
static void data1_iconv_s(data1_handle dh, NMEM m, data1_node *n, yaz_iconv_t t, WRBUF wrbuf, const char *tocode)
Definition: d1_read.c:988
data1_node * data1_append_node(data1_handle dh, NMEM m, int type, data1_node *parent)
Definition: d1_read.c:124
data1_node * data1_read_record(data1_handle dh, int(*rf)(void *, char *, size_t), void *fh, NMEM m)
Definition: d1_read.c:941
data1_node * data1_add_taggeddata(data1_handle dh, data1_node *root, data1_node *at, const char *tagname, NMEM m)
Definition: d1_read.c:474
data1_node * data1_get_root_tag(data1_handle dh, data1_node *n)
Definition: d1_read.c:36
data1_node * data1_insert_preprocess_n(data1_handle dh, NMEM nmem, const char *target, size_t len, const char **attr, data1_node *at)
Definition: d1_read.c:247
static data1_node * data1_add_insert_taggeddata(data1_handle dh, data1_node *at, const char *tagname, NMEM m, int local_allowed, int insert_mode)
Definition: d1_read.c:416
data1_node * data1_mk_comment(data1_handle dh, NMEM mem, const char *buf, data1_node *parent)
Definition: d1_read.c:362
data1_node * data1_mk_tag_n(data1_handle dh, NMEM nmem, const char *tag, size_t len, const char **attr, data1_node *at)
Definition: d1_read.c:259
data1_node * data1_mk_tag_data_text_uni(data1_handle dh, data1_node *at, const char *tag, const char *str, NMEM nmem)
Definition: d1_read.c:538
data1_node * data1_mk_tag_uni(data1_handle dh, NMEM nmem, const char *tag, data1_node *at)
Definition: d1_read.c:320
data1_node * data1_mk_node2(data1_handle dh, NMEM m, int type, data1_node *parent)
Definition: d1_read.c:146
data1_node * get_parent_tag(data1_handle dh, data1_node *n)
Definition: d1_read.c:53
data1_node * data1_mk_tag_data_zint(data1_handle dh, data1_node *at, const char *tag, zint num, NMEM nmem)
Definition: d1_read.c:481
void data1_chop_text(data1_handle dh, NMEM m, data1_node *n)
Definition: d1_read.c:1078
data1_node * data1_mk_text_n(data1_handle dh, NMEM mem, const char *buf, size_t len, data1_node *parent)
Definition: d1_read.c:331
data1_node * data1_mk_tag_data(data1_handle dh, data1_node *at, const char *tagname, NMEM m)
Definition: d1_read.c:449
data1_node * data1_mk_text_nf(data1_handle dh, NMEM mem, const char *buf, size_t len, data1_node *parent)
Definition: d1_read.c:339
data1_node * data1_mk_preprocess(data1_handle dh, NMEM nmem, const char *target, const char **attr, data1_node *at)
Definition: d1_read.c:219
data1_node * data1_insert_taggeddata(data1_handle dh, data1_node *root, data1_node *at, const char *tagname, NMEM m)
Definition: d1_read.c:467
data1_node * data1_read_node(data1_handle dh, const char **buf, NMEM m)
Definition: d1_read.c:928
void data1_set_data_string_n(data1_handle dh, data1_node *res, NMEM m, const char *str, size_t len)
Definition: d1_read.c:368
data1_node * data1_mk_preprocess_n(data1_handle dh, NMEM nmem, const char *target, size_t len, const char **attr, data1_node *at)
Definition: d1_read.c:227
data1_node * data1_mk_tag_data_oid(data1_handle dh, data1_node *at, const char *tag, Odr_oid *oid, NMEM nmem)
Definition: d1_read.c:501
int getc_mem(void *fh)
Definition: d1_read.c:920
const char * data1_get_encoding(data1_handle dh, data1_node *n)
Definition: d1_read.c:1039
data1_node * data1_mk_text(data1_handle dh, NMEM mem, const char *buf, data1_node *parent)
Definition: d1_read.c:347
data1_node * data1_mk_tag_data_int(data1_handle dh, data1_node *at, const char *tag, int num, NMEM nmem)
Definition: d1_read.c:494
void data1_set_root(data1_handle dh, data1_node *res, NMEM nmem, const char *name)
Definition: d1_read.c:191
static int ampr(int(*get_byte)(void *fh), void *fh, int *amp)
Definition: d1_read.c:551
static int conv_item(NMEM m, yaz_iconv_t t, WRBUF wrbuf, char *inbuf, size_t inlen)
Definition: d1_read.c:979
data1_element * data1_getelementbytagname(data1_handle dh, data1_absyn *abs, data1_element *parent, const char *tagname)
Definition: d1_absyn.c:312
#define DATA1N_variant
Definition: data1.h:280
data1_absyn * data1_get_absyn(data1_handle dh, const char *name, enum DATA1_XPATH_INDEXING en)
Definition: d1_absyn.c:231
#define DATA1N_comment
Definition: data1.h:282
#define DATA1I_oid
Definition: data1.h:318
#define DATA1N_tag
Definition: data1.h:276
#define DATA1N_data
Definition: data1.h:278
#define d1_isspace(c)
Definition: data1.h:31
#define DATA1N_root
Definition: data1.h:274
char ** data1_get_read_buf(data1_handle dp, int **lenp)
Definition: d1_handle.c:107
#define DATA1N_preprocess
Definition: data1.h:284
#define DATA1_LOCALDATA
Definition: data1.h:338
#define DATA1I_num
Definition: data1.h:316
#define DATA1I_text
Definition: data1.h:314
@ DATA1_XPATH_INDEXING_ENABLE
Definition: data1.h:349
data1_vartype * data1_getvartypebyct(data1_handle dh, data1_varset *set, const char *zclass, const char *type)
Definition: d1_varset.c:30
int data1_is_xmlmode(data1_handle dh)
Definition: d1_handle.c:170
#define DATA1I_xmltext
Definition: data1.h:320
char lbuf[DATA1_LOCALDATA]
Definition: data1.h:339
struct data1_node::@2::@7 preprocess
struct data1_node::@2::@3 root
struct data1_node * parent
Definition: data1.h:343
struct data1_node * child
Definition: data1.h:341
char * tag
Definition: data1.h:296
char * data
Definition: data1.h:307
struct data1_node * next
Definition: data1.h:340
struct data1_node * last_child
Definition: data1.h:342
union data1_node::@2 u
int which
Definition: data1.h:285
struct data1_node::@2::@6 variant
char * value
Definition: data1.h:261
char * name
Definition: data1.h:260
struct data1_xattr * next
Definition: data1.h:262
unsigned short what
Definition: data1.h:263
long zint
Zebra integer.
Definition: util.h:66
#define ZINT_FORMAT
Definition: util.h:72