YAZ  5.23.1
iconv_encode_iso_8859_1.c
Go to the documentation of this file.
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) Index Data
3  * See the file LICENSE for details.
4  */
11 #if HAVE_CONFIG_H
12 #include <config.h>
13 #endif
14 
15 #include <assert.h>
16 #include <errno.h>
17 #include <string.h>
18 
19 #include <yaz/xmalloc.h>
20 #include "iconv-p.h"
21 
/** Private state of the ISO-8859-1 encoder (stored in the encoder's data). */
struct encoder_data
{
    /* Character held back by the write handler so that it can be merged
       with a following combining character; 0 when nothing is pending. */
    unsigned long compose_char;
};
26 
27 
28 
/*
 * Table of two-character Unicode sequences (base letter x1 followed by
 * combining character x2) and the single ISO-8859-1/Latin-1 code y that
 * is equivalent to the pair.
 *
 * Entries are listed by ascending y, all within 0xc0..0xff. The table is
 * terminated by an all-zero entry; scans stop when x1 is 0. It is only
 * ever read, hence const.
 */
static const struct {
    unsigned long x1, x2;
    unsigned y;
} latin1_comb[] = {
    { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
    { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
    { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
    { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
    { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
    { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
    /* omitted: 0xc6 LATIN CAPITAL LETTER AE */
    { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
    { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
    { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
    { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
    { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
    { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
    { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
    { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
    { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
    { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
    { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
    { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
    { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
    { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
    { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
    /* omitted: 0xd7 MULTIPLICATION SIGN */
    /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */
    { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
    { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
    { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
    { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
    { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
    /* omitted: 0xde LATIN CAPITAL LETTER THORN */
    /* omitted: 0xdf LATIN SMALL LETTER SHARP S */
    { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
    { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
    { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
    { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
    { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
    { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
    /* omitted: 0xe6 LATIN SMALL LETTER AE */
    { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
    { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
    { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
    { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
    { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
    { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
    { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
    { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
    { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
    /* omitted: 0xf0 LATIN SMALL LETTER ETH */
    { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
    { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
    { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
    { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
    { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
    { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
    /* omitted: 0xf7 DIVISION SIGN */
    /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */
    { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
    { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
    { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
    { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
    { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
    /* omitted: 0xfe LATIN SMALL LETTER THORN */
    { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */

    { 0, 0, 0}            /* terminator */
};
99 
100 int yaz_iso_8859_1_lookup_y(unsigned long v,
101  unsigned long *x1, unsigned long *x2)
102 {
103  if (v >= 0xc0 && v <= 0xff) /* optimization. min and max .y values */
104  {
105  int i;
106  for (i = 0; latin1_comb[i].x1; i++)
107  {
108  if (v == latin1_comb[i].y)
109  {
110  *x1 = latin1_comb[i].x1;
111  *x2 = latin1_comb[i].x2;
112  return 1;
113  }
114  }
115  }
116  return 0;
117 }
118 
119 int yaz_iso_8859_1_lookup_x12(unsigned long x1, unsigned long x2,
120  unsigned long *y)
121 {
122  /* For MARC8s we try to get a Latin-1 page code out of it */
123  int i;
124  for (i = 0; latin1_comb[i].x1; i++)
125  if (x2 == latin1_comb[i].x2 && x1 == latin1_comb[i].x1)
126  {
127  *y = latin1_comb[i].y;
128  return 1;
129  }
130  return 0;
131 }
132 
134  unsigned long x,
135  char **outbuf, size_t *outbytesleft)
136 {
137  struct encoder_data *w = (struct encoder_data *) e->data;
138  /* list of two char unicode sequence that, when combined, are
139  equivalent to single unicode chars that can be represented in
140  ISO-8859-1/Latin-1.
141  Regular iconv on Linux at least does not seem to convert these,
142  but since MARC-8 to UTF-8 generates these composed sequence
143  we get a better chance of a successful MARC-8 -> ISO-8859-1
144  conversion */
145  unsigned char *outp = (unsigned char *) *outbuf;
146 
147  if (w->compose_char)
148  {
149  int i;
150  for (i = 0; latin1_comb[i].x1; i++)
151  if (w->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
152  {
153  x = latin1_comb[i].y;
154  break;
155  }
156  if (*outbytesleft < 1)
157  { /* no room. Retain compose_char and bail out */
159  return (size_t)(-1);
160  }
161  if (!latin1_comb[i].x1)
162  { /* not found. Just write compose_char */
163  *outp++ = (unsigned char) w->compose_char;
164  (*outbytesleft)--;
165  *outbuf = (char *) outp;
166  }
167  /* compose_char used so reset it. x now holds current char */
168  w->compose_char = 0;
169  }
170 
171  if (x > 32 && x < 127 && w->compose_char == 0)
172  {
173  w->compose_char = x;
174  return 0;
175  }
176  else if (x > 255 || x < 1)
177  {
179  return (size_t) -1;
180  }
181  else if (*outbytesleft < 1)
182  {
184  return (size_t)(-1);
185  }
186  *outp++ = (unsigned char) x;
187  (*outbytesleft)--;
188  *outbuf = (char *) outp;
189  return 0;
190 }
191 
193  char **outbuf, size_t *outbytesleft)
194 {
195  struct encoder_data *w = (struct encoder_data *) e->data;
196  if (w->compose_char)
197  {
198  unsigned char *outp = (unsigned char *) *outbuf;
199  if (*outbytesleft < 1)
200  {
202  return (size_t)(-1);
203  }
204  *outp++ = (unsigned char) w->compose_char;
205  (*outbytesleft)--;
206  *outbuf = (char *) outp;
207  w->compose_char = 0;
208  }
209  return 0;
210 }
211 
212 
214 {
215  struct encoder_data *w = (struct encoder_data *) e->data;
216  w->compose_char = 0;
217 }
218 
220 {
221  xfree(e->data);
222 }
223 
226 
227 {
228  if (!yaz_matchstr(tocode, "iso88591"))
229  {
230  struct encoder_data *data = (struct encoder_data *)
231  xmalloc(sizeof(*data));
232  e->data = data;
237  return e;
238  }
239  return 0;
240 }
241 
242 static unsigned long read_ISO8859_1(yaz_iconv_t cd,
244  unsigned char *inp,
245  size_t inbytesleft, size_t *no_read)
246 {
247  unsigned long x = inp[0];
248  *no_read = 1;
249  return x;
250 }
251 
254 
255 {
256  if (!yaz_matchstr(fromcode, "iso88591"))
257  {
259  return d;
260  }
261  return 0;
262 }
263 
264 
265 /*
266  * Local variables:
267  * c-basic-offset: 4
268  * c-file-style: "Stroustrup"
269  * indent-tabs-mode: nil
270  * End:
271  * vim: shiftwidth=4 tabstop=8 expandtab
272  */
273 
unsigned long x2
unsigned long compose_char
Header for errno utilities.
void(* init_handle)(yaz_iconv_encoder_t e)
Definition: iconv-p.h:50
size_t(* flush_handle)(yaz_iconv_t cd, yaz_iconv_encoder_t e, char **outbuf, size_t *outbytesleft)
Definition: iconv-p.h:48
size_t(* write_handle)(yaz_iconv_t cd, yaz_iconv_encoder_t e, unsigned long x, char **outbuf, size_t *outbytesleft)
Definition: iconv-p.h:45
static size_t write_iso_8859_1(yaz_iconv_t cd, yaz_iconv_encoder_t e, unsigned long x, char **outbuf, size_t *outbytesleft)
Internal header for iconv.
#define xfree(x)
utility macro which calls xfree_f
Definition: xmalloc.h:53
int yaz_iso_8859_1_lookup_x12(unsigned long x1, unsigned long x2, unsigned long *y)
static unsigned long read_ISO8859_1(yaz_iconv_t cd, yaz_iconv_decoder_t d, unsigned char *inp, size_t inbytesleft, size_t *no_read)
#define YAZ_ICONV_EILSEQ
error code: Invalid sequence
Definition: yaz-iconv.h:49
unsigned y
static size_t flush_iso_8859_1(yaz_iconv_t cd, yaz_iconv_encoder_t e, char **outbuf, size_t *outbytesleft)
void yaz_iconv_set_errno(yaz_iconv_t cd, int no)
Definition: siconv.c:298
void(* destroy_handle)(yaz_iconv_encoder_t e)
Definition: iconv-p.h:51
int yaz_matchstr(const char *s1, const char *s2)
match strings - independent of case and '-'
Definition: matchstr.c:42
#define xmalloc(x)
utility macro which calls malloc_f
Definition: xmalloc.h:49
unsigned long x1
yaz_iconv_decoder_t yaz_iso_8859_1_decoder(const char *fromcode, yaz_iconv_decoder_t d)
static struct @0 latin1_comb[]
int yaz_iso_8859_1_lookup_y(unsigned long v, unsigned long *x1, unsigned long *x2)
void destroy_iso_8859_1(yaz_iconv_encoder_t e)
Header for memory handling functions.
#define YAZ_ICONV_E2BIG
error code: Not sufficient room for output buffer
Definition: yaz-iconv.h:47
unsigned long(* read_handle)(yaz_iconv_t cd, yaz_iconv_decoder_t d, unsigned char *inbuf, size_t inbytesleft, size_t *no_read)
Definition: iconv-p.h:86
yaz_iconv_encoder_t yaz_iso_8859_1_encoder(const char *tocode, yaz_iconv_encoder_t e)
void init_iso_8859_1(yaz_iconv_encoder_t e)