YAZ  5.34.0
iconv_decode_marc8.c
Go to the documentation of this file.
/* This file is part of the YAZ toolkit.
 * Copyright (C) Index Data
 * See the file LICENSE for details.
 */
13 #if HAVE_CONFIG_H
14 #include <config.h>
15 #endif
16 
17 #include <assert.h>
18 #include <errno.h>
19 #include <string.h>
20 
21 #include <yaz/xmalloc.h>
22 #include "iconv-p.h"
23 
/* Per-conversion state of the MARC-8 decoder. */
struct decoder_data {
    int g0_mode;      /* final byte of the set selected for bytes below 128 */
    int g1_mode;      /* final byte of the set selected for bytes 128 and up */

    int comb_offset;  /* index of the next buffered character to hand out */
    int comb_size;    /* number of valid entries in comb_x / comb_no_read */
    unsigned long comb_x[8];   /* buffered combining characters */
    size_t comb_no_read[8];    /* input bytes consumed by each buffered entry */

    int control_mode; /* non-zero: bytes below ' ' are returned unchanged */
};
34 
47 
48 
49 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd,
50  struct decoder_data *data,
51  unsigned char *inp,
52  size_t inbytesleft, size_t *no_read,
53  int *comb);
54 
55 static unsigned long read_marc8(yaz_iconv_t cd, yaz_iconv_decoder_t d,
56  unsigned char *inp,
57  size_t inbytesleft, size_t *no_read)
58 {
59  struct decoder_data *data = (struct decoder_data *) d->data;
60  unsigned long x;
61  if (data->comb_offset < data->comb_size)
62  {
63  *no_read = data->comb_no_read[data->comb_offset];
64  x = data->comb_x[data->comb_offset];
65 
66  /* special case for double-diacritic combining characters,
67  INVERTED BREVE and DOUBLE TILDE.
68  We'll increment the no_read counter by 1, since we want to skip over
69  the processing of the closing ligature character
70  */
71  /* this code is no longer necessary.. our handlers code in
72  yaz_marc8_?_conv (generated by charconv.tcl) now returns
73  0 and no_read=1 when a sequence does not match the input.
74  The SECOND HALFs in codetables.xml produces a non-existant
75  entry in the conversion trie.. Hence when met, the input byte is
76  skipped as it should (in yaz_iconv)
77  */
78 #if 0
79  if (x == 0x0361 || x == 0x0360)
80  *no_read += 1;
81 #endif
82  data->comb_offset++;
83  return x;
84  }
85 
86  data->comb_offset = 0;
87  for (data->comb_size = 0; data->comb_size < 8; data->comb_size++)
88  {
89  int comb = 0;
90 
91  if (inbytesleft == 0 && data->comb_size)
92  {
94  x = 0;
95  *no_read = 0;
96  break;
97  }
98  x = yaz_read_marc8_comb(cd, data, inp, inbytesleft, no_read, &comb);
99  if (!comb || !x)
100  break;
101  data->comb_x[data->comb_size] = x;
102  data->comb_no_read[data->comb_size] = *no_read;
103  inp += *no_read;
104  inbytesleft = inbytesleft - *no_read;
105  }
106  return x;
107 }
108 
109 static unsigned long read_marc8s(yaz_iconv_t cd, yaz_iconv_decoder_t d,
110  unsigned char *inp,
111  size_t inbytesleft, size_t *no_read)
112 {
113  struct decoder_data *data = (struct decoder_data *) d->data;
114  unsigned long x = read_marc8(cd, d, inp, inbytesleft, no_read);
115  if (x && data->comb_size == 1)
116  {
117  if (yaz_iso_8859_1_lookup_x12(x, data->comb_x[0], &x))
118  {
119  *no_read += data->comb_no_read[0];
120  data->comb_size = 0;
121  }
122  }
123  return x;
124 }
125 
126 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd,
127  struct decoder_data *data,
128  unsigned char *inp,
129  size_t inbytesleft, size_t *no_read,
130  int *comb)
131 {
132  *no_read = 0;
133  while (inbytesleft > 0 && *inp == 27)
134  {
135  int *modep = &data->g0_mode;
136  size_t inbytesleft0 = inbytesleft;
137 
138  inbytesleft--;
139  inp++;
140  if (inbytesleft == 0)
141  goto incomplete;
142  if (*inp == '$') /* set with multiple bytes */
143  {
144  inbytesleft--;
145  inp++;
146  }
147  if (inbytesleft == 0)
148  goto incomplete;
149  if (*inp == '(' || *inp == ',') /* G0 */
150  {
151  inbytesleft--;
152  inp++;
153  }
154  else if (*inp == ')' || *inp == '-') /* G1 */
155  {
156  inbytesleft--;
157  inp++;
158  modep = &data->g1_mode;
159  }
160  if (inbytesleft == 0)
161  goto incomplete;
162  if (*inp == '!') /* ANSEL is a special case */
163  {
164  inbytesleft--;
165  inp++;
166  }
167  if (inbytesleft == 0)
168  goto incomplete;
169  *modep = *inp++; /* Final character */
170  inbytesleft--;
171 
172  (*no_read) += inbytesleft0 - inbytesleft;
173  }
174  if (inbytesleft == 0)
175  return 0;
176  else if (*inp == ' ')
177  {
178  *no_read += 1;
179  return ' ';
180  }
181  else if (*inp < ' ' && data->control_mode)
182  {
183  *no_read += 1;
184  return *inp;
185  }
186  else
187  {
188  unsigned long x;
189  size_t no_read_sub = 0;
190  int mode = *inp < 128 ? data->g0_mode : data->g1_mode;
191  *comb = 0;
192 
193  switch(mode)
194  {
195  case 'B': /* Basic ASCII */
196  case 's': /* ASCII */
197  x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
198  break;
199  case 'E': /* ANSEL */
200  x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb, 127, 128);
201  break;
202  case 'g': /* Greek */
203  x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
204  break;
205  case 'b': /* Subscripts */
206  x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
207  break;
208  case 'p': /* Superscripts */
209  x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
210  break;
211  case '2': /* Basic Hebrew */
212  x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
213  break;
214  case 'N': /* Basic Cyrillic */
215  x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
216  break;
217  case 'Q': /* Extended Cyrillic */
218  x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
219  break;
220  case '3': /* Basic Arabic */
221  x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
222  break;
223  case '4': /* Extended Arabic */
224  x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
225  break;
226  case 'S': /* Greek */
227  x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
228  break;
229  case '1': /* Chinese, Japanese, Korean (EACC) */
230  x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
231  break;
232  default:
233  *no_read = 0;
235  return 0;
236  }
237  *no_read += no_read_sub;
238  return x;
239  }
240 incomplete:
241  *no_read = 0;
243  return 0;
244 }
245 
246 
248  unsigned char *inp,
249  size_t inbytesleft, size_t *no_read)
250 {
251  struct decoder_data *data = (struct decoder_data *) d->data;
252  data->g0_mode = 'B';
253  data->g1_mode = 'E';
254  data->comb_offset = data->comb_size = 0;
255  data->control_mode = 0;
256  return 0;
257 }
258 
260  unsigned char *inp,
261  size_t inbytesleft, size_t *no_read)
262 {
263  struct decoder_data *data = (struct decoder_data *) d->data;
264 
265  init_marc8(cd, d, inp, inbytesleft, no_read);
266  data->control_mode = 1;
267  return 0;
268 }
269 
271 {
272  struct decoder_data *data = (struct decoder_data *) d->data;
273  xfree(data);
274 }
275 
278 {
279  if (!yaz_matchstr(fromcode, "MARC8") || !yaz_matchstr(fromcode, "ANSEL"))
280  {
281  d->read_handle = read_marc8;
282  d->init_handle = init_marc8;
283  }
284  else if (!yaz_matchstr(fromcode, "MARC8s"))
285  {
287  d->init_handle = init_marc8;
288  }
289  else if (!yaz_matchstr(fromcode, "MARC8c"))
290  {
291  d->read_handle = read_marc8;
293  }
294  else
295  return 0;
296  {
297  struct decoder_data *data = (struct decoder_data *)
298  xmalloc(sizeof(*data));
299  d->data = data;
301  }
302  return d;
303 }
304 
305 
/*
 * Local variables:
 * c-basic-offset: 4
 * c-file-style: "Stroustrup"
 * indent-tabs-mode: nil
 * End:
 * vim: shiftwidth=4 tabstop=8 expandtab
 */
314 
Header for errno utilities.
Internal header for iconv.
int yaz_iso_8859_1_lookup_x12(unsigned long x1, unsigned long x2, unsigned long *y)
unsigned long yaz_conv_func_t(unsigned char *inp, size_t inbytesleft, size_t *no_read, int *combining, unsigned mask, int boffset)
Definition: iconv-p.h:70
void yaz_iconv_set_errno(yaz_iconv_t cd, int no)
Definition: siconv.c:298
yaz_conv_func_t yaz_marc8_62_conv
yaz_conv_func_t yaz_marc8_33_conv
yaz_conv_func_t yaz_marc8_51_conv
yaz_conv_func_t yaz_marc8_31_conv
static size_t init_marc8(yaz_iconv_t cd, yaz_iconv_decoder_t d, unsigned char *inp, size_t inbytesleft, size_t *no_read)
static size_t init_marc8c(yaz_iconv_t cd, yaz_iconv_decoder_t d, unsigned char *inp, size_t inbytesleft, size_t *no_read)
yaz_conv_func_t yaz_marc8_34_conv
yaz_conv_func_t yaz_marc8_53_conv
yaz_conv_func_t yaz_marc8_70_conv
static unsigned long read_marc8s(yaz_iconv_t cd, yaz_iconv_decoder_t d, unsigned char *inp, size_t inbytesleft, size_t *no_read)
static unsigned long read_marc8(yaz_iconv_t cd, yaz_iconv_decoder_t d, unsigned char *inp, size_t inbytesleft, size_t *no_read)
void destroy_marc8(yaz_iconv_decoder_t d)
yaz_conv_func_t yaz_marc8_45_conv
yaz_conv_func_t yaz_marc8_42_conv
yaz_conv_func_t yaz_marc8_32_conv
yaz_iconv_decoder_t yaz_marc8_decoder(const char *fromcode, yaz_iconv_decoder_t d)
static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, struct decoder_data *data, unsigned char *inp, size_t inbytesleft, size_t *no_read, int *comb)
yaz_conv_func_t yaz_marc8_4E_conv
yaz_conv_func_t yaz_marc8_67_conv
int yaz_matchstr(const char *s1, const char *s2)
match strings - independent of case and '-'
Definition: matchstr.c:42
size_t comb_no_read[8]
size_t no_read[MAX_COMP]
unsigned long comb_x[8]
unsigned long(* read_handle)(yaz_iconv_t cd, yaz_iconv_decoder_t d, unsigned char *inbuf, size_t inbytesleft, size_t *no_read)
Definition: iconv-p.h:86
size_t(* init_handle)(yaz_iconv_t cd, yaz_iconv_decoder_t d, unsigned char *inbuf, size_t inbytesleft, size_t *no_read)
Definition: iconv-p.h:83
void(* destroy_handle)(yaz_iconv_decoder_t d)
Definition: iconv-p.h:89
Header for memory handling functions.
#define xfree(x)
utility macro which calls xfree_f
Definition: xmalloc.h:53
#define xmalloc(x)
utility macro which calls malloc_f
Definition: xmalloc.h:49
#define YAZ_ICONV_EILSEQ
error code: Invalid sequence
Definition: yaz-iconv.h:49
#define YAZ_ICONV_EINVAL
error code: An incomplete multibyte sequence is in input buffer
Definition: yaz-iconv.h:51