YAZ  5.23.1
utf8.c
Go to the documentation of this file.
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) Index Data
3  * See the file LICENSE for details.
4  */
10 #if HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13 
14 #include <assert.h>
15 #include <errno.h>
16 #include <string.h>
17 
18 #include "iconv-p.h"
19 
21  unsigned char *inp,
22  size_t inbytesleft, size_t *no_read)
23 {
24  if (!inp || inp[0] != 0xef)
25  {
26  *no_read = 0;
27  return 0;
28  }
29  if (inbytesleft < 3)
30  {
32  return (size_t) -1;
33  }
34  if (inp[1] != 0xbb && inp[2] == 0xbf)
35  *no_read = 3;
36  else
37  *no_read = 0;
38  return 0;
39 }
40 
/**
 * \brief Decodes one UTF-8 sequence (1 to 6 bytes) from a buffer.
 * \param inp input bytes
 * \param inbytesleft number of bytes available at inp
 * \param no_read set to the number of bytes consumed; 0 on any error
 * \param error set to YAZ_ICONV_EILSEQ for an invalid sequence (bad lead
 *        byte, bad continuation byte, or overlong form) and to
 *        YAZ_ICONV_EINVAL when the buffer ends before the sequence does;
 *        left untouched on success
 * \returns the decoded value; 0 on error, except for an overlong form,
 *          where the decoded value is returned with *error set
 */
unsigned long yaz_read_UTF8_char(const unsigned char *inp,
                                 size_t inbytesleft, size_t *no_read,
                                 int *error)
{
    unsigned long x;
    unsigned long min_value; /* smallest value that needs seq_len bytes */
    size_t seq_len;          /* total length implied by the lead byte */
    size_t i;
    unsigned char lead;

    *no_read = 0; /* by default */
    if (inbytesleft == 0)
    {
        /* empty buffer: inp[0] is not ours to read */
        *error = YAZ_ICONV_EINVAL;
        return 0;
    }

    lead = inp[0];
    if (lead <= 0x7f)
    {
        *no_read = 1;
        return lead;
    }
    if (lead <= 0xbf || lead >= 0xfe)
    {
        /* continuation byte in lead position, or 0xFE / 0xFF */
        *error = YAZ_ICONV_EILSEQ;
        return 0;
    }

    /* classify the lead byte: length, payload bits, overlong threshold */
    if (lead <= 0xdf)
    {
        seq_len = 2;
        min_value = 0x80;
        x = lead & 0x1f;
    }
    else if (lead <= 0xef)
    {
        seq_len = 3;
        min_value = 0x800;
        x = lead & 0x0f;
    }
    else if (lead <= 0xf7)
    {
        seq_len = 4;
        min_value = 0x10000;
        x = lead & 0x07;
    }
    else if (lead <= 0xfb)
    {
        seq_len = 5;
        min_value = 0x200000;
        x = lead & 0x03;
    }
    else
    {
        seq_len = 6;
        min_value = 0x4000000;
        x = lead & 0x01;
    }

    if (inbytesleft < seq_len)
    {
        *error = YAZ_ICONV_EINVAL; /* incomplete sequence */
        return 0;
    }

    /* every following byte must look like 10xxxxxx */
    for (i = 1; i < seq_len; i++)
    {
        if ((inp[i] & 0xc0) != 0x80)
        {
            *error = YAZ_ICONV_EILSEQ;
            return 0;
        }
        x = (x << 6) | (unsigned long) (inp[i] & 0x3f);
    }

    if (x >= min_value)
        *no_read = seq_len;
    else
        *error = YAZ_ICONV_EILSEQ; /* overlong encoding */
    return x;
}
137 
138 static unsigned long read_utf8(yaz_iconv_t cd, yaz_iconv_decoder_t d,
139  unsigned char *inp,
140  size_t inbytesleft, size_t *no_read)
141 {
142  int err = 0;
143  int r = yaz_read_UTF8_char(inp, inbytesleft, no_read, &err);
144  yaz_iconv_set_errno(cd, err);
145  return r;
146 }
147 
148 
150  unsigned long x,
151  char **outbuf, size_t *outbytesleft)
152 {
153  int err = 0;
154  int r = yaz_write_UTF8_char(x, outbuf, outbytesleft, &err);
155  yaz_iconv_set_errno(cd, err);
156  return r;
157 }
158 
/**
 * \brief Encodes one value as a UTF-8 sequence of 1 to 6 bytes.
 * \param x value to encode; anything above 0x3ffffff takes the six-byte
 *        form, no upper bound is checked
 * \param outbuf output cursor; advanced past the written bytes on success
 * \param outbytesleft remaining output space; reduced by the number of
 *        bytes written on success
 * \param error set to YAZ_ICONV_E2BIG when the sequence does not fit;
 *        left untouched on success
 * \returns 0 on success; (size_t) -1 when there is not enough room, in
 *          which case *outbuf and *outbytesleft are unchanged
 */
size_t yaz_write_UTF8_char(unsigned long x,
                           char **outbuf, size_t *outbytesleft,
                           int *error)
{
    unsigned char *dst = (unsigned char *) *outbuf;
    unsigned long lead_mark; /* high bits that tag the first byte */
    size_t nbytes;           /* length of the sequence needed for x */
    unsigned shift;

    if (x <= 0x7f)
    {
        nbytes = 1;
        lead_mark = 0x00;
    }
    else if (x <= 0x7ff)
    {
        nbytes = 2;
        lead_mark = 0xc0;
    }
    else if (x <= 0xffff)
    {
        nbytes = 3;
        lead_mark = 0xe0;
    }
    else if (x <= 0x1fffff)
    {
        nbytes = 4;
        lead_mark = 0xf0;
    }
    else if (x <= 0x3ffffff)
    {
        nbytes = 5;
        lead_mark = 0xf8;
    }
    else
    {
        nbytes = 6;
        lead_mark = 0xfc;
    }

    if (*outbytesleft < nbytes)
    {
        *error = YAZ_ICONV_E2BIG; /* not room for output */
        return (size_t)(-1);
    }

    /* first byte: marker plus the topmost payload bits */
    shift = 6 * (unsigned) (nbytes - 1);
    *dst++ = (unsigned char) ((x >> shift) | lead_mark);

    /* remaining bytes: six payload bits each, tagged 10xxxxxx */
    while (shift > 0)
    {
        shift -= 6;
        *dst++ = (unsigned char) (((x >> shift) & 0x3f) | 0x80);
    }

    *outbytesleft -= nbytes;
    *outbuf = (char *) dst;
    return 0;
}
218 
221 
222 {
223  if (!yaz_matchstr(tocode, "UTF8"))
224  {
226  return e;
227  }
228  return 0;
229 }
230 
233 {
234  if (!yaz_matchstr(fromcode, "UTF8"))
235  {
236  d->init_handle = init_utf8;
237  d->read_handle = read_utf8;
238  return d;
239  }
240  return 0;
241 }
242 
/**
 * \brief Checks whether a NUL-terminated string decodes cleanly as UTF-8.
 * \param str string to examine; its length is taken with strlen
 * \returns 1 when every sequence is accepted by yaz_read_UTF8_char (an
 *          empty string qualifies); 0 as soon as one sequence is rejected
 */
int yaz_utf8_check(const char *str)
{
    /* yaz_read_UTF8_char takes a const pointer, so const is kept here */
    const unsigned char *cursor = (const unsigned char *) str;
    size_t remaining;

    for (remaining = strlen(str); remaining > 0; )
    {
        size_t consumed;
        int status = 0;

        yaz_read_UTF8_char(cursor, remaining, &consumed, &status);
        if (status != 0)
            return 0;
        /* step over the sequence that was just accepted */
        cursor += consumed;
        remaining -= consumed;
    }
    return 1;
}
261 
262 /*
263  * Local variables:
264  * c-basic-offset: 4
265  * c-file-style: "Stroustrup"
266  * indent-tabs-mode: nil
267  * End:
268  * vim: shiftwidth=4 tabstop=8 expandtab
269  */
270 
int yaz_utf8_check(const char *str)
check whether string appears to be UTF-8 encoded
Definition: utf8.c:243
size_t(* init_handle)(yaz_iconv_t cd, yaz_iconv_decoder_t d, unsigned char *inbuf, size_t inbytesleft, size_t *no_read)
Definition: iconv-p.h:83
Header for errno utilities.
static size_t init_utf8(yaz_iconv_t cd, yaz_iconv_decoder_t d, unsigned char *inp, size_t inbytesleft, size_t *no_read)
Definition: utf8.c:20
size_t(* write_handle)(yaz_iconv_t cd, yaz_iconv_encoder_t e, unsigned long x, char **outbuf, size_t *outbytesleft)
Definition: iconv-p.h:45
Internal header for iconv.
static size_t write_UTF8(yaz_iconv_t cd, yaz_iconv_encoder_t en, unsigned long x, char **outbuf, size_t *outbytesleft)
Definition: utf8.c:149
unsigned long yaz_read_UTF8_char(const unsigned char *inp, size_t inbytesleft, size_t *no_read, int *error)
Definition: utf8.c:41
yaz_iconv_decoder_t yaz_utf8_decoder(const char *fromcode, yaz_iconv_decoder_t d)
Definition: utf8.c:231
#define YAZ_ICONV_EINVAL
error code: An incomplete multibyte sequence is in input buffer
Definition: yaz-iconv.h:51
static unsigned long read_utf8(yaz_iconv_t cd, yaz_iconv_decoder_t d, unsigned char *inp, size_t inbytesleft, size_t *no_read)
Definition: utf8.c:138
#define YAZ_ICONV_EILSEQ
error code: Invalid sequence
Definition: yaz-iconv.h:49
void yaz_iconv_set_errno(yaz_iconv_t cd, int no)
Definition: siconv.c:298
size_t yaz_write_UTF8_char(unsigned long x, char **outbuf, size_t *outbytesleft, int *error)
encodes UTF-8 sequence
Definition: utf8.c:159
int yaz_matchstr(const char *s1, const char *s2)
match strings - independent of case and '-'
Definition: matchstr.c:42
#define YAZ_ICONV_E2BIG
error code: Not sufficient room for output buffer
Definition: yaz-iconv.h:47
unsigned long(* read_handle)(yaz_iconv_t cd, yaz_iconv_decoder_t d, unsigned char *inbuf, size_t inbytesleft, size_t *no_read)
Definition: iconv-p.h:86
yaz_iconv_encoder_t yaz_utf8_encoder(const char *tocode, yaz_iconv_encoder_t e)
Definition: utf8.c:219