IDZEBRA  2.1.2
d1_read.c
Go to the documentation of this file.
1 /* This file is part of the Zebra server.
2  Copyright (C) Index Data
3 
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8 
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 for more details.
13 
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 
18 */
19 
20 
21 /*
22  * This module reads "loose" SGML and converts it to data1 tree
23  */
24 
25 #if HAVE_CONFIG_H
26 #include <config.h>
27 #endif
28 #include <assert.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 
32 #include <yaz/yaz-util.h>
33 #include <d1_absyn.h>
34 
36 {
37  if (!n)
38  return 0;
39  if (data1_is_xmlmode(dh))
40  {
41  n = n->child;
42  while (n && n->which != DATA1N_tag)
43  n = n->next;
44  }
45  return n;
46 }
47 
48 /*
49  * get the tag which is the immediate parent of this node (this may mean
50  * traversing intermediate things like variants and stuff).
51  */
53 {
54  if (data1_is_xmlmode(dh))
55  {
56  for (; n && n->which != DATA1N_root; n = n->parent)
57  if (n->which == DATA1N_tag && n->parent &&
58  n->parent->which != DATA1N_root)
59  return n;
60  }
61  else
62  {
63  for (; n && n->which != DATA1N_root; n = n->parent)
64  if (n->which == DATA1N_tag)
65  return n;
66  }
67  return 0;
68 }
69 
71 {
72  return data1_mk_node2 (dh, m, DATA1N_root, 0);
73 }
74 
76 {
77  return data1_mk_node2 (dh, m, type, 0);
78 }
79 
80 static void data1_init_node (data1_handle dh, data1_node *r, int type)
81 {
82  r->which = type;
83  switch(type)
84  {
85  case DATA1N_tag:
86  r->u.tag.tag = 0;
87  r->u.tag.element = 0;
88  r->u.tag.no_data_requested = 0;
89  r->u.tag.node_selected = 0;
90  r->u.tag.make_variantlist = 0;
91  r->u.tag.get_bytes = -1;
92  r->u.tag.attributes = 0;
93  break;
94  case DATA1N_root:
95  r->u.root.type = 0;
96  r->u.root.absyn = 0;
97  break;
98  case DATA1N_data:
99  r->u.data.data = 0;
100  r->u.data.len = 0;
101  r->u.data.what = 0;
102  r->u.data.formatted_text = 0;
103  break;
104  case DATA1N_comment:
105  r->u.data.data = 0;
106  r->u.data.len = 0;
107  r->u.data.what = 0;
108  r->u.data.formatted_text = 1;
109  break;
110  case DATA1N_variant:
111  r->u.variant.type = 0;
112  r->u.variant.value = 0;
113  break;
114  case DATA1N_preprocess:
115  r->u.preprocess.target = 0;
116  r->u.preprocess.attributes = 0;
117  break;
118  default:
119  yaz_log (YLOG_WARN, "data_mk_node_type. bad type = %d\n", type);
120  }
121 }
122 
124  data1_node *parent)
125 {
126  data1_node *r = (data1_node *)nmem_malloc(m, sizeof(*r));
127  r->next = r->child = r->last_child = 0;
128 
129  r->parent = parent;
130  if (!parent)
131  r->root = r;
132  else
133  {
134  r->root = parent->root;
135  if (!parent->child)
136  parent->child = parent->last_child = r;
137  else
138  parent->last_child->next = r;
139  parent->last_child = r;
140  }
141  data1_init_node(dh, r, type);
142  return r;
143 }
144 
145 data1_node *data1_mk_node2 (data1_handle dh, NMEM m, int type,
146  data1_node *parent)
147 {
148  return data1_append_node (dh, m, type, parent);
149 }
150 
152  data1_node *parent)
153 {
154  data1_node *r = (data1_node *)nmem_malloc(m, sizeof(*r));
155  r->next = r->child = r->last_child = 0;
156 
157  if (!parent)
158  r->root = r;
159  else
160  {
161  r->root = parent->root;
162  r->parent = parent;
163  if (!parent->child)
164  parent->last_child = r;
165  else
166  r->next = parent->child;
167  parent->child = r;
168  }
169  data1_init_node(dh, r, type);
170  return r;
171 }
172 
173 data1_node *data1_mk_root (data1_handle dh, NMEM nmem, const char *name)
174 {
175  data1_absyn *absyn = data1_get_absyn(dh, name, 1);
176  data1_node *res;
177 
178  if (!absyn)
179  {
180  yaz_log(YLOG_WARN, "Unable to acquire abstract syntax " "for '%s'",
181  name);
182  /* It's now OK for a record not to have an absyn */
183  }
184  res = data1_mk_node2 (dh, nmem, DATA1N_root, 0);
185  res->u.root.type = data1_insert_string (dh, res, nmem, name);
186  res->u.root.absyn = absyn;
187  return res;
188 }
189 
191  NMEM nmem, const char *name)
192 {
193  data1_absyn *absyn = data1_get_absyn(
194  dh, name, DATA1_XPATH_INDEXING_ENABLE);
195 
196  res->u.root.type = data1_insert_string (dh, res, nmem, name);
197  res->u.root.absyn = absyn;
198 }
199 
200 void data1_add_attrs(data1_handle dh, NMEM nmem, const char **attr,
201  data1_xattr **p)
202 {
203  while (*p)
204  p = &(*p)->next;
205 
206  while (attr && *attr)
207  {
208  *p = (data1_xattr*) nmem_malloc (nmem, sizeof(**p));
209  (*p)->name = nmem_strdup (nmem, *attr++);
210  (*p)->value = nmem_strdup (nmem, *attr++);
211  (*p)->what = DATA1I_text;
212 
213  p = &(*p)->next;
214  }
215  *p = 0;
216 }
217 
219  const char *target,
220  const char **attr, data1_node *at)
221 {
222  return data1_mk_preprocess_n (dh, nmem, target, strlen(target),
223  attr, at);
224 }
225 
227  const char *target, size_t len,
228  const char **attr, data1_node *at)
229 {
230  data1_node *res = data1_mk_node2 (dh, nmem, DATA1N_preprocess, at);
231  res->u.preprocess.target = data1_insert_string_n (dh, res, nmem,
232  target, len);
233 
234  data1_add_attrs(dh, nmem, attr, &res->u.preprocess.attributes);
235  return res;
236 }
237 
239  const char *target,
240  const char **attr, data1_node *at)
241 {
242  return data1_insert_preprocess_n (dh, nmem, target, strlen(target),
243  attr, at);
244 }
245 
247  const char *target, size_t len,
248  const char **attr, data1_node *at)
249 {
250  data1_node *res = data1_insert_node (dh, nmem, DATA1N_preprocess, at);
251  res->u.preprocess.target = data1_insert_string_n (dh, res, nmem,
252  target, len);
253 
254  data1_add_attrs(dh, nmem, attr, &res->u.preprocess.attributes);
255  return res;
256 }
257 
259  const char *tag, size_t len, const char **attr,
260  data1_node *at)
261 {
262  data1_node *partag = get_parent_tag(dh, at);
263  data1_node *res = data1_mk_node2 (dh, nmem, DATA1N_tag, at);
264  data1_element *e = 0;
265 
266  res->u.tag.tag = data1_insert_string_n (dh, res, nmem, tag, len);
267 
268  if (!partag) /* top tag? */
269  e = data1_getelementbytagname (dh, at->root->u.root.absyn,
270  0 /* index as local */,
271  res->u.tag.tag);
272  else
273  {
274  /* only set element for known tags */
275  e = partag->u.tag.element;
276  if (e)
277  e = data1_getelementbytagname (dh, at->root->u.root.absyn,
278  e, res->u.tag.tag);
279  }
280  res->u.tag.element = e;
281  data1_add_attrs(dh, nmem, attr, &res->u.tag.attributes);
282  return res;
283 }
284 
285 void data1_tag_add_attr (data1_handle dh, NMEM nmem,
286  data1_node *res, const char **attr)
287 {
288  if (res->which != DATA1N_tag)
289  return;
290 
291  data1_add_attrs(dh, nmem, attr, &res->u.tag.attributes);
292 }
293 
295  const char *tag, const char **attr, data1_node *at)
296 {
297  return data1_mk_tag_n (dh, nmem, tag, strlen(tag), attr, at);
298 }
299 
301  const char *tag)
302 {
303  if (*tag == '/')
304  {
305  n = data1_get_root_tag (dh, n);
306  if (n)
307  n = n->child;
308  tag++;
309  }
310  for (; n; n = n->next)
311  if (n->which == DATA1N_tag && n->u.tag.tag &&
312  !yaz_matchstr (n->u.tag.tag, tag))
313  {
314  return n;
315  }
316  return 0;
317 }
318 
320  const char *tag, data1_node *at)
321 {
322  data1_node *node = data1_search_tag (dh, at->child, tag);
323  if (!node)
324  node = data1_mk_tag (dh, nmem, tag, 0 /* attr */, at);
325  else
326  node->child = node->last_child = 0;
327  return node;
328 }
329 
331  const char *buf, size_t len, data1_node *parent)
332 {
333  data1_node *res = data1_mk_node2 (dh, mem, DATA1N_data, parent);
334  res->u.data.what = DATA1I_text;
335  res->u.data.len = len;
336 
337  res->u.data.data = data1_insert_string_n (dh, res, mem, buf, len);
338  return res;
339 }
340 
342  const char *buf, size_t len, data1_node *parent)
343 {
344  data1_node *res = data1_mk_text_n (dh, mem, buf, len, parent);
345  res->u.data.formatted_text = 1;
346  return res;
347 }
348 
350  const char *buf, data1_node *parent)
351 {
352  return data1_mk_text_n (dh, mem, buf, strlen(buf), parent);
353 }
354 
356  const char *buf, size_t len,
357  data1_node *parent)
358 {
359  data1_node *res = data1_mk_node2 (dh, mem, DATA1N_comment, parent);
360  res->u.data.what = DATA1I_text;
361  res->u.data.len = len;
362 
363  res->u.data.data = data1_insert_string_n (dh, res, mem, buf, len);
364  return res;
365 }
366 
368  const char *buf, data1_node *parent)
369 {
370  return data1_mk_comment_n (dh, mem, buf, strlen(buf), parent);
371 }
372 
374  NMEM m, const char *str, size_t len)
375 {
376  char *b;
377  if (len >= DATA1_LOCALDATA)
378  b = (char *) nmem_malloc (m, len+1);
379  else
380  b = res->lbuf;
381  memcpy (b, str, len);
382  b[len] = 0;
383  return b;
384 }
385 
387  NMEM m, const char *str)
388 {
389  return data1_insert_string_n (dh, res, m, str, strlen(str));
390 }
391 
393  data1_node *at,
394  const char *tagname, NMEM m,
395  int local_allowed,
396  int insert_mode)
397 {
398  data1_node *root = at->root;
399  data1_node *partag = get_parent_tag (dh, at);
400  data1_element *e = NULL;
401  data1_node *datn = 0;
402  data1_node *tagn = 0;
403 
404  if (!partag)
405  e = data1_getelementbytagname (dh, root->u.root.absyn, 0, tagname);
406  else
407  {
408  e = partag->u.tag.element;
409  if (e)
410  e = data1_getelementbytagname (dh, root->u.root.absyn, e, tagname);
411  }
412  if (local_allowed || e)
413  {
414  if (insert_mode)
415  tagn = data1_insert_node (dh, m, DATA1N_tag, at);
416  else
417  tagn = data1_append_node (dh, m, DATA1N_tag, at);
418  tagn->u.tag.tag = data1_insert_string (dh, tagn, m, tagname);
419  tagn->u.tag.element = e;
420  datn = data1_mk_node2 (dh, m, DATA1N_data, tagn);
421  }
422  return datn;
423 }
424 
426  const char *tagname, NMEM m)
427 {
428  return data1_add_insert_taggeddata (dh, at, tagname, m, 1, 0);
429 }
430 
431 
432 /*
433  * Insert a tagged node into the record root as first child of the node at
434  * (which should be root or tag itself). Returns pointer to the data node,
435  * which can then be modified.
436  */
438  const char *tagname, NMEM m)
439 {
440  return data1_add_insert_taggeddata (dh, at, tagname, m, 0, 1);
441 }
442 
444  data1_node *at, const char *tagname,
445  NMEM m)
446 {
447  return data1_add_insert_taggeddata (dh, at, tagname, m, 0, 1);
448 }
449 
451  data1_node *at, const char *tagname,
452  NMEM m)
453 {
454  return data1_add_insert_taggeddata (dh, at, tagname, m, 1, 0);
455 }
456 
458  const char *tag, zint num,
459  NMEM nmem)
460 {
461  data1_node *node_data;
462 
463  node_data = data1_mk_tag_data (dh, at, tag, nmem);
464  if (!node_data)
465  return 0;
466  node_data->u.data.what = DATA1I_num;
467  node_data->u.data.data = node_data->lbuf;
468  sprintf (node_data->u.data.data, ZINT_FORMAT, num);
469  node_data->u.data.len = strlen (node_data->u.data.data);
470  return node_data;
471 }
472 
474  const char *tag, int num,
475  NMEM nmem)
476 {
477  return data1_mk_tag_data_zint(dh, at, tag, num, nmem);
478 }
479 
481  const char *tag, Odr_oid *oid,
482  NMEM nmem)
483 {
484  data1_node *node_data;
485  char str[128], *p = str;
486  Odr_oid *ii;
487 
488  node_data = data1_mk_tag_data (dh, at, tag, nmem);
489  if (!node_data)
490  return 0;
491 
492  for (ii = oid; *ii >= 0; ii++)
493  {
494  if (ii != oid)
495  *p++ = '.';
496  sprintf (p, "%d", *ii);
497  p += strlen (p);
498  }
499  node_data->u.data.what = DATA1I_oid;
500  node_data->u.data.len = strlen (str);
501  node_data->u.data.data = data1_insert_string (dh, node_data, nmem, str);
502  return node_data;
503 }
504 
505 
507  const char *tag, const char *str,
508  NMEM nmem)
509 {
510  data1_node *node_data;
511 
512  node_data = data1_mk_tag_data (dh, at, tag, nmem);
513  if (!node_data)
514  return 0;
515  node_data->u.data.what = DATA1I_text;
516  node_data->u.data.len = strlen (str);
517  node_data->u.data.data = data1_insert_string (dh, node_data, nmem, str);
518  return node_data;
519 }
520 
521 
523  const char *tag, const char *str,
524  NMEM nmem)
525 {
526  data1_node *node = data1_search_tag (dh, at->child, tag);
527  if (!node)
528  return data1_mk_tag_data_text (dh, at, tag, str, nmem);
529  else
530  {
531  data1_node *node_data = node->child;
532  node_data->u.data.what = DATA1I_text;
533  node_data->u.data.len = strlen (str);
534  node_data->u.data.data = data1_insert_string (dh, node_data,
535  nmem, str);
536  node_data->child = node_data->last_child = 0;
537  return node_data;
538  }
539 }
540 
/*
 * Fetch the next input byte through get_byte.
 *
 * *amp tells the caller whether the returned character came from a
 * decoded "&...;" entity. Entity decoding is currently compiled out, so
 * the byte is passed through as is and *amp is always 0. The decoder is
 * kept in the #else branch.
 */
static int ampr (int (*get_byte)(void *fh), void *fh, int *amp)
{
#if 1
    int next = (*get_byte)(fh);

    *amp = 0;
    return next;
#else
    int c = (*get_byte)(fh);
    *amp = 0;
    if (c == '&')
    {
        char ent[20];
        int i = 0;

        for (;;)
        {
            c = (*get_byte)(fh);
            if (c == ';')
            {
                ent[i] = 0;

                /* unknown entities are turned into a blank */
                c = ' ';
                if (!strcmp (ent, "quot"))
                    c = '"';
                if (!strcmp (ent, "apos"))
                    c = '\'';
                if (!strcmp (ent, "gt"))
                    c = '>';
                if (!strcmp (ent, "lt"))
                    c = '<';
                if (!strcmp (ent, "amp"))
                    c = '&';
                *amp = 1;
                break;
            }
            else if (c == 0 || d1_isspace(c))
                break;
            if (i < 19)
                ent[i++] = c;
        }
    }
    return c;
#endif
}
585 
587  int (*get_byte)(void *fh), void *fh,
588  WRBUF wrbuf, int *ch, int *amp)
589 {
590  data1_xattr *p_first = 0;
591  data1_xattr **pp = &p_first;
592  int c = *ch;
593  for (;;)
594  {
595  data1_xattr *p;
596  while (*amp || (c && d1_isspace(c)))
597  c = ampr (get_byte, fh, amp);
598  if (*amp == 0 && (c == 0 || c == '>' || c == '/'))
599  break;
600  *pp = p = (data1_xattr *) nmem_malloc (m, sizeof(*p));
601  p->next = 0;
602  pp = &p->next;
603  p->value = 0;
604  p->what = DATA1I_xmltext;
605 
606  wrbuf_rewind(wrbuf);
607  while (c && c != '=' && c != '>' && c != '/' && !d1_isspace(c))
608  {
609  wrbuf_putc (wrbuf, c);
610  c = ampr (get_byte, fh, amp);
611  }
612  p->name = nmem_strdup (m, wrbuf_cstr(wrbuf));
613  if (c == '=')
614  {
615  c = ampr (get_byte, fh, amp);
616  if (*amp == 0 && c == '"')
617  {
618  c = ampr (get_byte, fh, amp);
619  wrbuf_rewind(wrbuf);
620  while (*amp || (c && c != '"'))
621  {
622  wrbuf_putc (wrbuf, c);
623  c = ampr (get_byte, fh, amp);
624  }
625  if (c)
626  c = ampr (get_byte, fh, amp);
627  }
628  else if (*amp == 0 && c == '\'')
629  {
630  c = ampr (get_byte, fh, amp);
631  wrbuf_rewind(wrbuf);
632  while (*amp || (c && c != '\''))
633  {
634  wrbuf_putc (wrbuf, c);
635  c = ampr (get_byte, fh, amp);
636  }
637  if (c)
638  c = ampr (get_byte, fh, amp);
639  }
640  else
641  {
642  wrbuf_rewind(wrbuf);
643  while (*amp || (c && c != '>' && c != '/'))
644  {
645  wrbuf_putc (wrbuf, c);
646  c = ampr (get_byte, fh, amp);
647  }
648  }
649  p->value = nmem_strdup(m, wrbuf_cstr(wrbuf));
650  }
651  }
652  *ch = c;
653  return p_first;
654 }
655 
656 /*
657  * Ugh. Sometimes functions just grow and grow on you. This one reads a
658  * 'node' and its children.
659  */
661  int (*get_byte)(void *fh), void *fh, WRBUF wrbuf)
662 {
663  data1_node *d1_stack[256];
664  data1_node *res;
665  int c, amp;
666  int level = 0;
667  int line = 1;
668 
669  d1_stack[level] = 0;
670  c = ampr (get_byte, fh, &amp);
671  while (c != '\0')
672  {
673  data1_node *parent = level ? d1_stack[level-1] : 0;
674 
675  if (amp == 0 && c == '<') /* beginning of tag */
676  {
677  data1_xattr *xattr;
678 
679  char tag[256];
680  int null_tag = 0;
681  int end_tag = 0;
682  size_t i = 0;
683 
684  c = ampr (get_byte, fh, &amp);
685  if (amp == 0 && c == '/')
686  {
687  end_tag = 1;
688  c = ampr (get_byte, fh, &amp);
689  }
690  else if (amp == 0 && c == '?')
691  {
692  int quote_mode = 0;
693  while ((c = ampr(get_byte, fh, &amp)))
694  {
695  if (amp)
696  continue;
697  if (quote_mode == 0)
698  {
699  if (c == '"')
700  quote_mode = c;
701  else if (c == '\'')
702  quote_mode = c;
703  else if (c == '>')
704  {
705  c = ampr(get_byte, fh, &amp);
706  break;
707  }
708  }
709  else
710  {
711  if (amp == 0 && c == quote_mode)
712  quote_mode = 0;
713  }
714  }
715  continue;
716  }
717  else if (amp == 0 && c == '!')
718  {
719  int c0, amp0;
720 
721  wrbuf_rewind(wrbuf);
722 
723  c0 = ampr (get_byte, fh, &amp0);
724  if (amp0 == 0 && c0 == '\0')
725  break;
726  c = ampr (get_byte, fh, &amp);
727 
728  if (amp0 == 0 && c0 == '-' && amp == 0 && c == '-')
729  {
730  /* COMMENT: <!-- ... --> */
731  int no_dash = 0;
732 
733  c = ampr (get_byte, fh, &amp);
734  while (amp || c)
735  {
736  if (amp == 0 && c == '-')
737  no_dash++;
738  else if (amp == 0 && c == '>' && no_dash >= 2)
739  {
740  if (level > 0)
741  d1_stack[level] =
743  dh, m,
744  wrbuf_buf(wrbuf), wrbuf_len(wrbuf)-2,
745  d1_stack[level-1]);
746  c = ampr (get_byte, fh, &amp); /* skip > */
747  break;
748  }
749  else
750  no_dash = 0;
751  wrbuf_putc (wrbuf, c);
752  c = ampr (get_byte, fh, &amp);
753  }
754  continue;
755  }
756  else
757  { /* DIRECTIVE: <! .. > */
758 
759  int blevel = 0;
760  while (amp || c)
761  {
762  if (amp == 0 && c == '>' && blevel == 0)
763  {
764  c = ampr (get_byte, fh, &amp);
765  break;
766  }
767  if (amp == 0 && c == '[')
768  blevel++;
769  if (amp == 0 && c == ']' && blevel > 0)
770  blevel--;
771  c = ampr (get_byte, fh, &amp);
772  }
773  continue;
774  }
775  }
776  while (amp || (c && c != '>' && c != '/' && !d1_isspace(c)))
777  {
778  if (i < (sizeof(tag)-1))
779  tag[i++] = c;
780  c = ampr (get_byte, fh, &amp);
781  }
782  tag[i] = '\0';
783  xattr = data1_read_xattr (dh, m, get_byte, fh, wrbuf, &c, &amp);
784  if (amp == 0 && c == '/')
785  { /* <tag attrs/> or <tag/> */
786  null_tag = 1;
787  c = ampr (get_byte, fh, &amp);
788  }
789  if (amp || c != '>')
790  {
791  yaz_log(YLOG_WARN, "d1: %d: Malformed tag", line);
792  return 0;
793  }
794  else
795  c = ampr (get_byte, fh, &amp);
796 
797  /* End tag? */
798  if (end_tag)
799  {
800  if (*tag == '\0')
801  --level; /* </> */
802  else
803  { /* </tag> */
804  int i = level;
805  while (i > 0)
806  {
807  parent = d1_stack[--i];
808  if ((parent->which == DATA1N_root &&
809  !strcmp(tag, parent->u.root.type)) ||
810  (parent->which == DATA1N_tag &&
811  !strcmp(tag, parent->u.tag.tag)))
812  {
813  level = i;
814  break;
815  }
816  }
817  if (i != level)
818  {
819  yaz_log (YLOG_WARN, "%d: no begin tag for %s",
820  line, tag);
821  break;
822  }
823  }
824  if (data1_is_xmlmode(dh))
825  {
826  if (level <= 1)
827  return d1_stack[0];
828  }
829  else
830  {
831  if (level <= 0)
832  return d1_stack[0];
833  }
834  continue;
835  }
836  else if (!strcmp(tag, "var")
837  && xattr && xattr->next && xattr->next->next
838  && xattr->value == 0
839  && xattr->next->value == 0
840  && xattr->next->next->value == 0)
841  {
842  /* <var class type value> */
843  const char *tclass = xattr->name;
844  const char *type = xattr->next->name;
845  const char *value = xattr->next->name;
846  data1_vartype *tp;
847 
848  yaz_log(YLOG_LOG, "Variant class=%s type=%s value=%s",
849  tclass, type, value);
850  if (!(tp =
852  parent->root->u.root.absyn->varset,
853  tclass, type)))
854  continue;
855  /*
856  * If we're the first variant in this group, create a parent
857  * variant, and insert it before the current variant.
858  */
859  if (parent->which != DATA1N_variant)
860  {
861  res = data1_mk_node2 (dh, m, DATA1N_variant, parent);
862  }
863  else
864  {
865  /*
866  * now determine if one of our ancestor triples is of
867  * same type. If so, we break here.
868  */
869  int i;
870  for (i = level-1; d1_stack[i]->which==DATA1N_variant; --i)
871  if (d1_stack[i]->u.variant.type == tp)
872  {
873  level = i;
874  break;
875  }
876  res = data1_mk_node2 (dh, m, DATA1N_variant, parent);
877  res->u.variant.type = tp;
878  res->u.variant.value =
879  data1_insert_string (dh, res, m, value);
880  }
881  }
882  else
883  {
884 
885  /* tag .. acquire our element in the abstract syntax */
886  if (level == 0)
887  {
888  parent = data1_mk_root (dh, m, tag);
889  res = d1_stack[level] = parent;
890 
891  if (data1_is_xmlmode(dh))
892  {
893  level++;
894  res = data1_mk_tag (dh, m, tag, 0 /* attr */, parent);
895  res->u.tag.attributes = xattr;
896  }
897  }
898  else
899  {
900  res = data1_mk_tag (dh, m, tag, 0 /* attr */, parent);
901  res->u.tag.attributes = xattr;
902  }
903  }
904  d1_stack[level] = res;
905  d1_stack[level+1] = 0;
906  if (level < 250 && !null_tag)
907  ++level;
908  }
909  else /* != '<'... this is a body of text */
910  {
911  int len;
912 
913  if (level == 0)
914  {
915  c = ampr (get_byte, fh, &amp);
916  continue;
917  }
918  res = data1_mk_node2 (dh, m, DATA1N_data, parent);
919  res->u.data.what = DATA1I_xmltext;
920  res->u.data.formatted_text = 0;
921  d1_stack[level] = res;
922 
923  wrbuf_rewind(wrbuf);
924 
925  while (amp || (c && c != '<'))
926  {
927  wrbuf_putc (wrbuf, c);
928  c = ampr (get_byte, fh, &amp);
929  }
930  len = wrbuf_len(wrbuf);
931 
932  /* use local buffer of nmem if too large */
933  if (len >= DATA1_LOCALDATA)
934  res->u.data.data = (char*) nmem_malloc (m, len);
935  else
936  res->u.data.data = res->lbuf;
937 
938  if (len)
939  memcpy (res->u.data.data, wrbuf_buf(wrbuf), len);
940  else
941  res->u.data.data = 0;
942  res->u.data.len = len;
943  }
944  }
945  return 0;
946 }
947 
/*
 * Byte source for data1_read_nodex that reads from memory.
 *
 * fh points to a "const char *" cursor into a null-terminated buffer.
 * Returns the byte at the cursor and advances the cursor, or returns 0
 * (without advancing) when the cursor is at the terminating null.
 *
 * The byte is returned as an unsigned char value, like fgetc() does, so
 * bytes >= 0x80 are never reported as negative numbers.
 */
int getc_mem (void *fh)
{
    const char **p = (const char **) fh;

    if (**p)
        return (unsigned char) *(*p)++;
    return 0;
}
955 
956 data1_node *data1_read_node (data1_handle dh, const char **buf, NMEM m)
957 {
958  WRBUF wrbuf = wrbuf_alloc();
959  data1_node *node;
960 
961  node = data1_read_nodex(dh, m, getc_mem, (void *) (buf), wrbuf);
962  wrbuf_destroy(wrbuf);
963  return node;
964 }
965 
966 /*
967  * Read a record in the native syntax.
968  */
970  int (*rf)(void *, char *, size_t), void *fh,
971  NMEM m)
972 {
973  int *size;
974  char **buf = data1_get_read_buf (dh, &size);
975  const char *bp;
976  int rd = 0, res;
977 
978  if (!*buf)
979  *buf = (char *)xmalloc(*size = 4096);
980 
981  for (;;)
982  {
983  if (rd + 2048 >= *size && !(*buf =(char *)xrealloc(*buf, *size *= 2)))
984  abort();
985  if ((res = (*rf)(fh, *buf + rd, 2048)) <= 0)
986  {
987  if (!res)
988  {
989  bp = *buf;
990  (*buf)[rd] = '\0';
991  return data1_read_node(dh, &bp, m);
992  }
993  else
994  return 0;
995  }
996  rd += res;
997  }
998 }
999 
1000 data1_node *data1_read_sgml (data1_handle dh, NMEM m, const char *buf)
1001 {
1002  const char *bp = buf;
1003  return data1_read_node (dh, &bp, m);
1004 }
1005 
1006 
1007 static int conv_item(NMEM m, yaz_iconv_t t,
1008  WRBUF wrbuf, char *inbuf, size_t inlen)
1009 {
1010  wrbuf_rewind(wrbuf);
1011  wrbuf_iconv_write(wrbuf, t, inbuf, inlen);
1012  wrbuf_iconv_reset(wrbuf, t);
1013  return 0;
1014 }
1015 
1016 static void data1_iconv_s (data1_handle dh, NMEM m, data1_node *n,
1017  yaz_iconv_t t, WRBUF wrbuf, const char *tocode)
1018 {
1019  for (; n; n = n->next)
1020  {
1021  switch (n->which)
1022  {
1023  case DATA1N_data:
1024  case DATA1N_comment:
1025  if (conv_item (m, t, wrbuf, n->u.data.data, n->u.data.len) == 0)
1026  {
1027  n->u.data.data =
1028  data1_insert_string_n (dh, n, m, wrbuf->buf,
1029  wrbuf->pos);
1030  n->u.data.len = wrbuf->pos;
1031  }
1032  break;
1033  case DATA1N_tag:
1034  if (conv_item (m, t, wrbuf, n->u.tag.tag, strlen(n->u.tag.tag))
1035  == 0)
1036  {
1037  n->u.tag.tag =
1038  data1_insert_string_n (dh, n, m,
1039  wrbuf->buf, wrbuf->pos);
1040  }
1041  if (n->u.tag.attributes)
1042  {
1043  data1_xattr *p;
1044  for (p = n->u.tag.attributes; p; p = p->next)
1045  {
1046  if (p->value &&
1047  conv_item(m, t, wrbuf, p->value, strlen(p->value))
1048  == 0)
1049  {
1050  p->value = nmem_strdup(m, wrbuf_cstr(wrbuf));
1051  }
1052  }
1053  }
1054  break;
1055  case DATA1N_preprocess:
1056  if (strcmp(n->u.preprocess.target, "xml") == 0)
1057  {
1058  data1_xattr *p = n->u.preprocess.attributes;
1059  for (; p; p = p->next)
1060  if (strcmp (p->name, "encoding") == 0)
1061  p->value = nmem_strdup (m, tocode);
1062  }
1063  break;
1064  }
1065  data1_iconv_s (dh, m, n->child, t, wrbuf, tocode);
1066  }
1067 }
1068 
1070 {
1071  /* see if we have an xml header that specifies encoding */
1072  if (n && n->child && n->child->which == DATA1N_preprocess &&
1073  strcmp (n->child->u.preprocess.target, "xml") == 0)
1074  {
1075  data1_xattr *xp = n->child->u.preprocess.attributes;
1076  for (; xp; xp = xp->next)
1077  if (strcmp (xp->name, "encoding") == 0)
1078  return xp->value;
1079  }
1080  /* no encoding in header, so see if "encoding" was specified for abs */
1081  if (n && n->which == DATA1N_root &&
1082  n->u.root.absyn && n->u.root.absyn->encoding)
1083  return n->u.root.absyn->encoding;
1084  /* none of above, return a hard coded default */
1085  return "ISO-8859-1";
1086 }
1087 
1089  const char *tocode,
1090  const char *fromcode)
1091 {
1092  if (yaz_matchstr (tocode, fromcode))
1093  {
1094  WRBUF wrbuf = wrbuf_alloc();
1095  yaz_iconv_t t = yaz_iconv_open(tocode, fromcode);
1096  if (!t)
1097  {
1098  wrbuf_destroy(wrbuf);
1099  return -1;
1100  }
1101  data1_iconv_s(dh, m, n, t, wrbuf, tocode);
1102  yaz_iconv_close(t);
1103  wrbuf_destroy(wrbuf);
1104  }
1105  return 0;
1106 }
1107 
1109 {
1110  for (; n; n = n->next)
1111  {
1112  if (n->which == DATA1N_data)
1113  {
1114 
1115  int sz = n->u.data.len;
1116  const char *ndata = n->u.data.data;
1117  int off = 0;
1118 
1119  for (off = 0; off < sz; off++)
1120  if (!d1_isspace(ndata[off]))
1121  break;
1122  sz = sz - off;
1123  ndata += off;
1124 
1125  while (sz && d1_isspace(ndata[sz - 1]))
1126  sz--;
1127 
1128  n->u.data.data = nmem_malloc(m, sz);
1129  n->u.data.len = sz;
1130  memcpy(n->u.data.data, ndata, sz);
1131 
1132  }
1133  data1_chop_text(dh, m, n->child);
1134  }
1135 }
1136 
1138 {
1139  for (; n; n = n->next)
1140  {
1141  if (n->which == DATA1N_data && n->next &&
1142  n->next->which == DATA1N_data)
1143  {
1144  int sz = 0;
1145  int off = 0;
1146  char *ndata;
1147  data1_node *np;
1148  for (np = n; np && np->which == DATA1N_data; np=np->next)
1149  sz += np->u.data.len;
1150  ndata = nmem_malloc(m, sz);
1151  for (np = n; np && np->which == DATA1N_data; np=np->next)
1152  {
1153  memcpy(ndata+off, np->u.data.data, np->u.data.len);
1154  off += np->u.data.len;
1155  }
1156  n->u.data.data = ndata;
1157  n->u.data.len = sz;
1158  n->next = np;
1159  if (!np && n->parent)
1160  n->parent->last_child = n;
1161 
1162  }
1163  data1_concat_text(dh, m, n->child);
1164  }
1165 }
1166 
1167 /*
1168  * Local variables:
1169  * c-basic-offset: 4
1170  * c-file-style: "Stroustrup"
1171  * indent-tabs-mode: nil
1172  * End:
1173  * vim: shiftwidth=4 tabstop=8 expandtab
1174  */
1175 
int getc_mem(void *fh)
Definition: d1_read.c:948
data1_node * data1_insert_node(data1_handle dh, NMEM m, int type, data1_node *parent)
Definition: d1_read.c:151
data1_node * data1_mk_tag_n(data1_handle dh, NMEM nmem, const char *tag, size_t len, const char **attr, data1_node *at)
Definition: d1_read.c:258
#define DATA1N_tag
Definition: data1.h:276
char * data1_insert_string(data1_handle dh, data1_node *res, NMEM m, const char *str)
Definition: d1_read.c:386
data1_node * data1_mk_preprocess(data1_handle dh, NMEM nmem, const char *target, const char **attr, data1_node *at)
Definition: d1_read.c:218
data1_absyn * data1_get_absyn(data1_handle dh, const char *name, enum DATA1_XPATH_INDEXING en)
Definition: d1_absyn.c:230
char ** data1_get_read_buf(data1_handle dp, int **lenp)
Definition: d1_handle.c:111
data1_node * data1_get_root_tag(data1_handle dh, data1_node *n)
Definition: d1_read.c:35
static int conv_item(NMEM m, yaz_iconv_t t, WRBUF wrbuf, char *inbuf, size_t inlen)
Definition: d1_read.c:1007
void data1_tag_add_attr(data1_handle dh, NMEM nmem, data1_node *res, const char **attr)
Definition: d1_read.c:285
char lbuf[DATA1_LOCALDATA]
Definition: data1.h:339
#define DATA1N_root
Definition: data1.h:274
data1_node * data1_read_sgml(data1_handle dh, NMEM m, const char *buf)
Definition: d1_read.c:1000
data1_node * data1_add_taggeddata(data1_handle dh, data1_node *root, data1_node *at, const char *tagname, NMEM m)
Definition: d1_read.c:450
void data1_chop_text(data1_handle dh, NMEM m, data1_node *n)
Definition: d1_read.c:1108
data1_node * data1_mk_tag_data_zint(data1_handle dh, data1_node *at, const char *tag, zint num, NMEM nmem)
Definition: d1_read.c:457
static int ampr(int(*get_byte)(void *fh), void *fh, int *amp)
Definition: d1_read.c:541
data1_node * data1_insert_preprocess_n(data1_handle dh, NMEM nmem, const char *target, size_t len, const char **attr, data1_node *at)
Definition: d1_read.c:246
data1_node * data1_mk_tag_data_wd(data1_handle dh, data1_node *at, const char *tagname, NMEM m)
Definition: d1_read.c:437
char * name
Definition: data1.h:260
int data1_iconv(data1_handle dh, NMEM m, data1_node *n, const char *tocode, const char *fromcode)
Definition: d1_read.c:1088
data1_vartype * data1_getvartypebyct(data1_handle dh, data1_varset *set, const char *zclass, const char *type)
Definition: d1_varset.c:30
data1_node * data1_mk_node(data1_handle dh, NMEM m)
Definition: d1_read.c:70
static void data1_init_node(data1_handle dh, data1_node *r, int type)
Definition: d1_read.c:80
data1_node * data1_append_node(data1_handle dh, NMEM m, int type, data1_node *parent)
Definition: d1_read.c:123
struct data1_xattr * next
Definition: data1.h:262
data1_node * data1_mk_text_n(data1_handle dh, NMEM mem, const char *buf, size_t len, data1_node *parent)
Definition: d1_read.c:330
data1_node * data1_mk_comment_n(data1_handle dh, NMEM mem, const char *buf, size_t len, data1_node *parent)
Definition: d1_read.c:355
struct data1_node * last_child
Definition: data1.h:342
#define DATA1I_oid
Definition: data1.h:318
char * value
Definition: data1.h:261
data1_node * data1_insert_preprocess(data1_handle dh, NMEM nmem, const char *target, const char **attr, data1_node *at)
Definition: d1_read.c:238
data1_node * data1_mk_tag_data_int(data1_handle dh, data1_node *at, const char *tag, int num, NMEM nmem)
Definition: d1_read.c:473
#define DATA1N_comment
Definition: data1.h:282
data1_node * data1_mk_node_type(data1_handle dh, NMEM m, int type)
Definition: d1_read.c:75
data1_node * get_parent_tag(data1_handle dh, data1_node *n)
Definition: d1_read.c:52
data1_node * data1_mk_root(data1_handle dh, NMEM nmem, const char *name)
Definition: d1_read.c:173
#define DATA1I_text
Definition: data1.h:314
void data1_concat_text(data1_handle dh, NMEM m, data1_node *n)
Definition: d1_read.c:1137
data1_node * data1_read_node(data1_handle dh, const char **buf, NMEM m)
Definition: d1_read.c:956
void data1_add_attrs(data1_handle dh, NMEM nmem, const char **attr, data1_xattr **p)
Definition: d1_read.c:200
struct data1_node::@2::@7 preprocess
#define DATA1N_preprocess
Definition: data1.h:284
char * tag
Definition: data1.h:296
union data1_node::@2 u
struct data1_node::@2::@3 root
data1_node * data1_mk_preprocess_n(data1_handle dh, NMEM nmem, const char *target, size_t len, const char **attr, data1_node *at)
Definition: d1_read.c:226
struct data1_node * child
Definition: data1.h:341
struct data1_node::@2::@6 variant
data1_node * data1_mk_tag_data_oid(data1_handle dh, data1_node *at, const char *tag, Odr_oid *oid, NMEM nmem)
Definition: d1_read.c:480
data1_node * data1_mk_tag_data_text(data1_handle dh, data1_node *at, const char *tag, const char *str, NMEM nmem)
Definition: d1_read.c:506
data1_node * data1_mk_tag_data(data1_handle dh, data1_node *at, const char *tagname, NMEM m)
Definition: d1_read.c:425
#define DATA1I_num
Definition: data1.h:316
const char * data1_get_encoding(data1_handle dh, data1_node *n)
Definition: d1_read.c:1069
int which
Definition: data1.h:285
struct data1_node * parent
Definition: data1.h:343
data1_element * data1_getelementbytagname(data1_handle dh, data1_absyn *abs, data1_element *parent, const char *tagname)
Definition: d1_absyn.c:311
data1_node * data1_read_nodex(data1_handle dh, NMEM m, int(*get_byte)(void *fh), void *fh, WRBUF wrbuf)
Definition: d1_read.c:660
char * data1_insert_string_n(data1_handle dh, data1_node *res, NMEM m, const char *str, size_t len)
Definition: d1_read.c:373
unsigned short what
Definition: data1.h:263
long zint
Zebra integer.
Definition: util.h:66
static void data1_iconv_s(data1_handle dh, NMEM m, data1_node *n, yaz_iconv_t t, WRBUF wrbuf, const char *tocode)
Definition: d1_read.c:1016
data1_node * data1_mk_node2(data1_handle dh, NMEM m, int type, data1_node *parent)
Definition: d1_read.c:145
struct data1_node * next
Definition: data1.h:340
char * data
Definition: data1.h:307
#define DATA1N_variant
Definition: data1.h:280
data1_node * data1_mk_comment(data1_handle dh, NMEM mem, const char *buf, data1_node *parent)
Definition: d1_read.c:367
int data1_is_xmlmode(data1_handle dh)
Definition: d1_handle.c:174
data1_node * data1_insert_taggeddata(data1_handle dh, data1_node *root, data1_node *at, const char *tagname, NMEM m)
Definition: d1_read.c:443
#define DATA1I_xmltext
Definition: data1.h:320
data1_node * data1_mk_text_nf(data1_handle dh, NMEM mem, const char *buf, size_t len, data1_node *parent)
Definition: d1_read.c:341
data1_node * data1_mk_tag(data1_handle dh, NMEM nmem, const char *tag, const char **attr, data1_node *at)
Definition: d1_read.c:294
data1_node * data1_read_record(data1_handle dh, int(*rf)(void *, char *, size_t), void *fh, NMEM m)
Definition: d1_read.c:969
data1_node * data1_mk_text(data1_handle dh, NMEM mem, const char *buf, data1_node *parent)
Definition: d1_read.c:349
data1_node * data1_search_tag(data1_handle dh, data1_node *n, const char *tag)
Definition: d1_read.c:300
struct record_index_entry ent
static data1_node * data1_add_insert_taggeddata(data1_handle dh, data1_node *at, const char *tagname, NMEM m, int local_allowed, int insert_mode)
Definition: d1_read.c:392
data1_xattr * data1_read_xattr(data1_handle dh, NMEM m, int(*get_byte)(void *fh), void *fh, WRBUF wrbuf, int *ch, int *amp)
Definition: d1_read.c:586
void data1_set_root(data1_handle dh, data1_node *res, NMEM nmem, const char *name)
Definition: d1_read.c:190
data1_node * data1_mk_tag_data_text_uni(data1_handle dh, data1_node *at, const char *tag, const char *str, NMEM nmem)
Definition: d1_read.c:522
#define DATA1N_data
Definition: data1.h:278
#define DATA1_LOCALDATA
Definition: data1.h:338
#define d1_isspace(c)
Definition: data1.h:31
data1_node * data1_mk_tag_uni(data1_handle dh, NMEM nmem, const char *tag, data1_node *at)
Definition: d1_read.c:319
#define ZINT_FORMAT
Definition: util.h:72