metaproxy  1.13.0
html_parser.cpp
Go to the documentation of this file.
1 /* This file is part of Metaproxy.
2  Copyright (C) Index Data
3 
4 Metaproxy is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8 
9 Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 for more details.
13 
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18 
19 #include "config.hpp"
20 #include "html_parser.hpp"
21 
22 #include <assert.h>
23 #include <string.h>
24 #include <stdlib.h>
25 #include <ctype.h>
26 #include <stdio.h>
27 #include <yaz/matchstr.h>
28 
29 #define SPACECHR " \t\r\n\f"
30 
31 // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html
32 
33 namespace metaproxy_1 {
35  friend class HTMLParser;
36  public:
37  void parse_str(HTMLParserEvent &event, const char *cp);
38  void tagText(HTMLParserEvent &event,
39  const char *text_start, const char *text_end);
40  int tagEnd(HTMLParserEvent &event,
41  const char *tag, int tag_len, const char *cp);
42  int tagAttrs(HTMLParserEvent &event,
43  const char *name, int len,
44  const char *cp);
45  int skipAttribute(HTMLParserEvent &event,
46  const char *cp, int *attr_len,
47  const char **value, int *val_len, int *tr);
48  Rep();
49  ~Rep();
50  int m_verbose;
51  bool nest;
52  };
53 }
54 
55 namespace mp = metaproxy_1;
56 
57 mp::HTMLParser::Rep::Rep()
58 {
59  m_verbose = 0;
60  nest = true;
61 }
62 
63 mp::HTMLParser::Rep::~Rep()
64 {
65 }
66 
68 {
69 }
70 
72 {
73 }
74 
76 {
77  m_p->m_verbose = v;
78 }
79 
80 
81 void mp::HTMLParser::parse(mp::HTMLParserEvent & event, const char *str) const
82 {
83  m_p->parse_str(event, str);
84 }
85 
// Returns 1 if c is an ASCII letter (A-Z or a-z) and 0 otherwise.
// Only the two ASCII ranges are tested, so the result does not depend
// on the current locale.
static int isAlpha(int c)
{
    if (c >= 'A' && c <= 'Z')
        return 1;
    if (c >= 'a' && c <= 'z')
        return 1;
    return 0;
}
90 
91 static int skipSpace(const char *cp)
92 {
93  int i = 0;
94  while (cp[i] && strchr(SPACECHR, cp[i]))
95  i++;
96  return i;
97 }
98 
99 static int skipName(const char *cp)
100 {
101  int i;
102  for (i = 0; cp[i] && !strchr(SPACECHR "/><=", cp[i]); i++)
103  ;
104  return i;
105 }
106 
107 int mp::HTMLParser::Rep::skipAttribute(HTMLParserEvent &event,
108  const char *cp, int *attr_len,
109  const char **value, int *val_len,
110  int *tr)
111 {
112  int v0, v1;
113  int i = skipName(cp);
114  *attr_len = i;
115  *value = NULL;
116  if (!i)
117  return skipSpace(cp);
118  i += skipSpace(cp + i);
119  if (cp[i] == '=')
120  {
121  i++;
122  i += skipSpace(cp + i);
123  if (cp[i] == '\"' || cp[i] == '\'')
124  {
125  *tr = cp[i];
126  v0 = ++i;
127  while (cp[i] != *tr && cp[i])
128  i++;
129  v1 = i;
130  if (cp[i])
131  i++;
132  }
133  else
134  {
135  *tr = 0;
136  v0 = i;
137  while (cp[i] && !strchr(SPACECHR ">", cp[i]))
138  i++;
139  v1 = i;
140  }
141  *value = cp + v0;
142  *val_len = v1 - v0;
143  i += skipSpace(cp + i);
144  }
145  return i;
146 }
147 
148 int mp::HTMLParser::Rep::tagAttrs(HTMLParserEvent &event,
149  const char *name, int len,
150  const char *cp)
151 {
152  int i = skipSpace(cp);
153  while (cp[i] && !strchr("/><", cp[i]))
154  {
155  const char *attr_name = cp + i;
156  int attr_len;
157  const char *value;
158  int val_len;
159  int tr;
160  char x[2];
161  int nor = skipAttribute(event, cp+i, &attr_len, &value, &val_len, &tr);
162  if (!nor)
163  break;
164  i += nor;
165 
166  x[0] = tr;
167  x[1] = 0;
168  if (m_verbose)
169  {
170  printf("------ attr %.*s", attr_len, attr_name);
171  if (value)
172  printf("=%.*s", val_len, value);
173  printf("\n");
174  }
175  event.attribute(name, len, attr_name, attr_len, value, val_len, x);
176  }
177  return i;
178 }
179 
180 int mp::HTMLParser::Rep::tagEnd(HTMLParserEvent &event,
181  const char *tag, int tag_len, const char *cp)
182 {
183  int i = 0;
184  int close_it = 0;
185  for (; cp[i] && !strchr("/><", cp[i]); i++)
186  ;
187  if (i > 0)
188  {
189  if (m_verbose)
190  printf("------ text %.*s\n", i, cp);
191  event.text(cp, i);
192  }
193  if (cp[i] == '/')
194  {
195  close_it = 1;
196  i++;
197  }
198  if (cp[i] == '>')
199  {
200  if (m_verbose)
201  printf("------ any tag %s %.*s\n",
202  close_it ? "close" : "end", tag_len, tag);
203  event.anyTagEnd(tag, tag_len, close_it);
204  i++;
205  }
206  return i;
207 }
208 
209 void mp::HTMLParser::Rep::tagText(HTMLParserEvent &event,
210  const char *text_start, const char *text_end)
211 {
212  if (text_end - text_start) //got text to flush
213  {
214  if (m_verbose)
215  printf("------ text %.*s\n",
216  (int) (text_end - text_start), text_start);
217  event.text(text_start, text_end-text_start);
218  }
219 }
220 
221 void mp::HTMLParser::Rep::parse_str(HTMLParserEvent &event, const char *cp)
222 {
223  const char *text_start = cp;
224  while (*cp)
225  {
226  if (*cp++ != '<')
227  continue;
228 
229  if (nest && *cp == '!')
230  {
231  int i;
232  tagText(event, text_start, cp - 1);
233  if (cp[1] == '-' && cp[2] == '-')
234  {
235  for (i = 3; cp[i]; i++)
236  if (cp[i] == '-' && cp[i+1] == '-' && cp[i+2] == '>')
237  {
238  i+= 2;
239  event.openTagStart(cp, i);
240  break;
241  }
242  }
243  else
244  {
245  for (i = 1; cp[i] && cp[i] != '>'; i++)
246  ;
247  event.openTagStart(cp, i);
248  }
249  if (m_verbose)
250  printf("------ dtd %.*s\n", i, cp);
251  i += tagEnd(event, cp, i, cp + i);
252  cp += i;
253  text_start = cp;
254  }
255  else if (nest && *cp == '?')
256  {
257  int i;
258  tagText(event, text_start, cp - 1);
259  for (i = 1; cp[i] && cp[i] != '>'; i++)
260  ;
261  event.openTagStart(cp, i);
262  if (m_verbose)
263  printf("------ pi %.*s\n", i, cp);
264  i += tagEnd(event, cp, i, cp + i);
265  cp += i;
266  text_start = cp;
267  }
268  else if (*cp == '/' && isAlpha(cp[1]))
269  {
270  int i;
271 
272  i = skipName(++cp);
273 
274  if (!nest)
275  {
276  if (i == 6 && !yaz_strncasecmp(cp, "script", i))
277  {
278  int ws = skipSpace(cp + 6);
279  if (cp[ws + 6] == '>')
280  nest = true; /* really terminated */
281  }
282  if (!nest)
283  continue;
284  }
285  tagText(event, text_start, cp - 2);
286  event.closeTag(cp, i);
287  if (m_verbose)
288  printf("------ tag close %.*s\n", i, cp);
289  i += tagEnd(event, cp, i, cp + i);
290  cp += i;
291  text_start = cp;
292  }
293  else if (nest && isAlpha(*cp))
294  {
295  int i, j;
296  tagText(event, text_start, cp - 1);
297  i = skipName(cp);
298  event.openTagStart(cp, i);
299  if (m_verbose)
300  printf("------ tag open %.*s\n", i, cp);
301  j = tagAttrs(event, cp, i, cp + i);
302  j += tagEnd(event, cp, i, cp + i + j);
303 
304  if (i == 6 && !yaz_strncasecmp(cp, "script", i))
305  nest = false;
306 
307  cp += i + j;
308  text_start = cp;
309  }
310  }
311  tagText(event, text_start, cp);
312 }
313 
315 {
316 }
317 
318 /*
319  * Local variables:
320  * c-basic-offset: 4
321  * c-file-style: "Stroustrup"
322  * indent-tabs-mode: nil
323  * End:
324  * vim: shiftwidth=4 tabstop=8 expandtab
325  */
326 
int tagEnd(HTMLParserEvent &event, const char *tag, int tag_len, const char *cp)
int tagAttrs(HTMLParserEvent &event, const char *name, int len, const char *cp)
void tagText(HTMLParserEvent &event, const char *text_start, const char *text_end)
static int skipSpace(const char *cp)
Definition: html_parser.cpp:91
void parse(HTMLParserEvent &event, const char *str) const
Definition: html_parser.cpp:81
boost::scoped_ptr< Rep > m_p
Definition: html_parser.hpp:46
#define SPACECHR
Definition: html_parser.cpp:29
static int skipName(const char *cp)
Definition: html_parser.cpp:99
void parse_str(HTMLParserEvent &event, const char *cp)
static int isAlpha(int c)
Definition: html_parser.cpp:86
int skipAttribute(HTMLParserEvent &event, const char *cp, int *attr_len, const char **value, int *val_len, int *tr)