IDZEBRA  2.2.7
grepper.c
Go to the documentation of this file.
1 /* This file is part of the Zebra server.
2  Copyright (C) Index Data
3 
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8 
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 for more details.
13 
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 
18 */
19 
20 
21 #if HAVE_CONFIG_H
22 #include <config.h>
23 #endif
24 #include <stdlib.h>
25 #include <string.h>
26 #include <stdio.h>
27 #include <ctype.h>
28 #include <assert.h>
29 
30 #include <idzebra/util.h>
31 #include <yaz/yaz-util.h>
32 #include <dfa.h>
33 #include "imalloc.h"
34 
35 char *prog;
36 static int show_line = 0;
37 
38 typedef unsigned MatchWord;
39 #define WORD_BITS 32
40 
41 typedef struct {
42  int n; /* no of MatchWord needed */
43  int range; /* max no. of errors */
44  MatchWord *Sc; /* Mask Sc */
45 } MatchContext;
46 
47 #define INFBUF_SIZE 16384
48 
49 #define INLINE
50 
51 static INLINE void set_bit (MatchContext *mc, MatchWord *m, int ch, int state)
52 {
53  int off = state & (WORD_BITS-1);
54  int wno = state / WORD_BITS;
55 
56  m[mc->n * ch + wno] |= 1<<off;
57 }
58 
59 static INLINE void reset_bit (MatchContext *mc, MatchWord *m, int ch,
60  int state)
61 {
62  int off = state & (WORD_BITS-1);
63  int wno = state / WORD_BITS;
64 
65  m[mc->n * ch + wno] &= ~(1<<off);
66 }
67 
68 static INLINE MatchWord get_bit (MatchContext *mc, MatchWord *m, int ch,
69  int state)
70 {
71  int off = state & (WORD_BITS-1);
72  int wno = state / WORD_BITS;
73 
74  return m[mc->n * ch + wno] & (1<<off);
75 }
76 
77 static MatchContext *mk_MatchContext (struct DFA *dfa, int range)
78 {
79  MatchContext *mc = imalloc (sizeof(*mc));
80  int i;
81 
82  mc->n = (dfa->no_states+WORD_BITS) / WORD_BITS;
83  mc->range = range;
84  mc->Sc = icalloc (sizeof(*mc->Sc) * 256 * mc->n);
85 
86  for (i=0; i<dfa->no_states; i++)
87  {
88  int j;
89  struct DFA_state *state = dfa->states[i];
90 
91  for (j=0; j<state->tran_no; j++)
92  {
93  int ch;
94  int ch0 = state->trans[j].ch[0];
95  int ch1 = state->trans[j].ch[1];
96  assert (ch0 >= 0 && ch1 >= 0);
97 
98  for (ch = ch0; ch <= ch1; ch++)
99  set_bit (mc, mc->Sc, ch, i);
100  }
101  }
102  return mc;
103 }
104 
105 
106 static void mask_shift (MatchContext *mc, MatchWord *Rdst, MatchWord *Rsrc,
107  struct DFA *dfa, int ch)
108 {
109  int j, s = 0;
110  MatchWord *Rsrc_p = Rsrc, mask;
111 
112  Rdst[0] = 1;
113  for (j = 1; j<mc->n; j++)
114  Rdst[j] = 0;
115  while (1)
116  {
117  mask = *Rsrc_p++;
118  for (j = 0; j<WORD_BITS/4; j++)
119  {
120  if (mask & 15)
121  {
122  if (mask & 1)
123  {
124  struct DFA_state *state = dfa->states[s];
125  int i = state->tran_no;
126  while (--i >= 0)
127  if (ch >= state->trans[i].ch[0] &&
128  ch <= state->trans[i].ch[1])
129  set_bit (mc, Rdst, 0, state->trans[i].to);
130  }
131  if (mask & 2)
132  {
133  struct DFA_state *state = dfa->states[s+1];
134  int i = state->tran_no;
135  while (--i >= 0)
136  if (ch >= state->trans[i].ch[0] &&
137  ch <= state->trans[i].ch[1])
138  set_bit (mc, Rdst, 0, state->trans[i].to);
139  }
140  if (mask & 4)
141  {
142  struct DFA_state *state = dfa->states[s+2];
143  int i = state->tran_no;
144  while (--i >= 0)
145  if (ch >= state->trans[i].ch[0] &&
146  ch <= state->trans[i].ch[1])
147  set_bit (mc, Rdst, 0, state->trans[i].to);
148  }
149  if (mask & 8)
150  {
151  struct DFA_state *state = dfa->states[s+3];
152  int i = state->tran_no;
153  while (--i >= 0)
154  if (ch >= state->trans[i].ch[0] &&
155  ch <= state->trans[i].ch[1])
156  set_bit (mc, Rdst, 0, state->trans[i].to);
157  }
158  }
159  s += 4;
160  if (s >= dfa->no_states)
161  return;
162  mask >>= 4;
163  }
164  }
165 }
166 
167 static void shift (MatchContext *mc, MatchWord *Rdst, MatchWord *Rsrc,
168  struct DFA *dfa)
169 {
170  int j, s = 0;
171  MatchWord *Rsrc_p = Rsrc, mask;
172  for (j = 0; j<mc->n; j++)
173  Rdst[j] = 0;
174  while (1)
175  {
176  mask = *Rsrc_p++;
177  for (j = 0; j<WORD_BITS/4; j++)
178  {
179  if (mask & 15)
180  {
181  if (mask & 1)
182  {
183  struct DFA_state *state = dfa->states[s];
184  int i = state->tran_no;
185  while (--i >= 0)
186  set_bit (mc, Rdst, 0, state->trans[i].to);
187  }
188  if (mask & 2)
189  {
190  struct DFA_state *state = dfa->states[s+1];
191  int i = state->tran_no;
192  while (--i >= 0)
193  set_bit (mc, Rdst, 0, state->trans[i].to);
194  }
195  if (mask & 4)
196  {
197  struct DFA_state *state = dfa->states[s+2];
198  int i = state->tran_no;
199  while (--i >= 0)
200  set_bit (mc, Rdst, 0, state->trans[i].to);
201  }
202  if (mask & 8)
203  {
204  struct DFA_state *state = dfa->states[s+3];
205  int i = state->tran_no;
206  while (--i >= 0)
207  set_bit (mc, Rdst, 0, state->trans[i].to);
208  }
209  }
210  s += 4;
211  if (s >= dfa->no_states)
212  return;
213  mask >>= 4;
214  }
215  }
216 }
217 
218 static void or (MatchContext *mc, MatchWord *Rdst,
219  MatchWord *Rsrc1, MatchWord *Rsrc2)
220 {
221  int i;
222  for (i = 0; i<mc->n; i++)
223  Rdst[i] = Rsrc1[i] | Rsrc2[i];
224 }
225 
226 
227 static int go (MatchContext *mc, struct DFA *dfa, FILE *inf)
228 {
229  MatchWord *Rj, *Rj1, *Rj_a, *Rj_b, *Rj_c;
230  int s, d, ch;
231  int lineno = 1;
232  char *infbuf;
233  int inf_ptr = 1;
234  int no_match = 0;
235 
236  infbuf = imalloc (INFBUF_SIZE);
237  infbuf[0] = '\n';
238  Rj = icalloc (mc->n * (mc->range+1) * sizeof(*Rj));
239  Rj1 = icalloc (mc->n * (mc->range+1) * sizeof(*Rj));
240  Rj_a = icalloc (mc->n * sizeof(*Rj));
241  Rj_b = icalloc (mc->n * sizeof(*Rj));
242  Rj_c = icalloc (mc->n * sizeof(*Rj));
243 
244  set_bit (mc, Rj, 0, 0);
245  for (d = 1; d<=mc->range; d++)
246  {
247  int s;
248  memcpy (Rj + mc->n * d, Rj + mc->n * (d-1), mc->n * sizeof(*Rj));
249  for (s = 0; s<dfa->no_states; s++)
250  {
251  if (get_bit (mc, Rj, d-1, s))
252  {
253  struct DFA_state *state = dfa->states[s];
254  int i = state->tran_no;
255  while (--i >= 0)
256  set_bit (mc, Rj, d, state->trans[i].to);
257  }
258  }
259  }
260  while ((ch = getc (inf)) != EOF)
261  {
262  MatchWord *Rj_t;
263 
264  infbuf[inf_ptr] = ch;
265  if (ch == '\n')
266  {
267  if (no_match)
268  {
269  int i = inf_ptr;
270  if (show_line)
271  printf ("%5d:", lineno);
272  do
273  {
274  if (--i < 0)
275  i = INFBUF_SIZE-1;
276  } while (infbuf[i] != '\n');
277  do
278  {
279  if (++i == INFBUF_SIZE)
280  i = 0;
281  putchar (infbuf[i]);
282  } while (infbuf[i] != '\n');
283  no_match = 0;
284  }
285  lineno++;
286  }
287  if (++inf_ptr == INFBUF_SIZE)
288  inf_ptr = 0;
289  mask_shift (mc, Rj1, Rj, dfa, ch);
290  for (d = 1; d <= mc->range; d++)
291  {
292  mask_shift (mc, Rj_b, Rj+d*mc->n, dfa, ch); /* 1 */
293 
294  or (mc, Rj_a, Rj+(d-1)*mc->n, Rj1+(d-1)*mc->n); /* 2,3 */
295 
296  shift (mc, Rj_c, Rj_a, dfa);
297 
298  or (mc, Rj_a, Rj_b, Rj_c); /* 1,2,3*/
299 
300  or (mc, Rj1+d*mc->n, Rj_a, Rj+(d-1)*mc->n); /* 1,2,3,4 */
301  }
302  for (s = 0; s<dfa->no_states; s++)
303  {
304  if (dfa->states[s]->rule_no)
305  if (get_bit (mc, Rj1+mc->range*mc->n, 0, s))
306  no_match++;
307  }
308  for (d = 0; d <= mc->range; d++)
309  reset_bit (mc, Rj1+d*mc->n, 0, dfa->no_states);
310  Rj_t = Rj1;
311  Rj1 = Rj;
312  Rj = Rj_t;
313  }
314  ifree (Rj);
315  ifree (Rj1);
316  ifree (Rj_a);
317  ifree (Rj_b);
318  ifree (Rj_c);
319  ifree (infbuf);
320  return 0;
321 }
322 
323 static int grep_file (struct DFA *dfa, const char *fname, int range)
324 {
325  FILE *inf;
326  MatchContext *mc;
327 
328  if (fname)
329  {
330  inf = fopen (fname, "r");
331  if (!inf)
332  {
333  yaz_log (YLOG_FATAL|YLOG_ERRNO, "cannot open `%s'", fname);
334  exit (1);
335  }
336  }
337  else
338  inf = stdin;
339 
340  mc = mk_MatchContext (dfa, range);
341 
342  go (mc, dfa, inf);
343 
344  if (fname)
345  fclose (inf);
346  return 0;
347 }
348 
349 int main (int argc, char **argv)
350 {
351  int ret;
352  int range = 0;
353  char *arg;
354  const char *pattern = NULL;
355  int no_files = 0;
356  struct DFA *dfa = dfa_init();
357 
358  prog = argv[0];
359  while ((ret = options ("nr:dsv:", argv, argc, &arg)) != -2)
360  {
361  if (ret == 0)
362  {
363  if (!pattern)
364  {
365  int i;
366  pattern = arg;
367  i = dfa_parse (dfa, &pattern);
368  if (i || *pattern)
369  {
370  fprintf (stderr, "%s: illegal pattern\n", prog);
371  return 1;
372  }
373  dfa_mkstate (dfa);
374  }
375  else
376  {
377  no_files++;
378  grep_file (dfa, arg, range);
379  }
380  }
381  else if (ret == 'v')
382  {
383  yaz_log_init (yaz_log_mask_str(arg), prog, NULL);
384  }
385  else if (ret == 's')
386  {
387  dfa_verbose = 1;
388  }
389  else if (ret == 'd')
390  {
391  debug_dfa_tran = 1;
393  debug_dfa_trav = 1;
394  }
395  else if (ret == 'r')
396  {
397  range = atoi (arg);
398  }
399  else if (ret == 'n')
400  {
401  show_line = 1;
402  }
403  else
404  {
405  yaz_log (YLOG_FATAL, "Unknown option '-%s'", arg);
406  exit (1);
407  }
408  }
409  if (!pattern)
410  {
411  fprintf (stderr, "usage:\n "
412  " %s [-d] [-n] [-r n] [-s] [-v n] pattern file ..\n", prog);
413  exit (1);
414  }
415  else if (no_files == 0)
416  {
417  grep_file (dfa, NULL, range);
418  }
419  dfa_delete (&dfa);
420  return 0;
421 }
422 /*
423  * Local variables:
424  * c-basic-offset: 4
425  * c-file-style: "Stroustrup"
426  * indent-tabs-mode: nil
427  * End:
428  * vim: shiftwidth=4 tabstop=8 expandtab
429  */
430 
static char * inf_ptr
Definition: agrep.c:110
int dfa_parse(struct DFA *, const char **)
Definition: dfa.c:1121
void dfa_mkstate(struct DFA *)
Definition: dfa.c:1148
void dfa_delete(struct DFA **)
Definition: dfa.c:1158
int debug_dfa_followpos
Definition: dfa.c:68
int debug_dfa_trav
Definition: dfa.c:66
int dfa_verbose
Definition: dfa.c:69
int debug_dfa_tran
Definition: dfa.c:67
struct DFA * dfa_init(void)
Definition: dfa.c:1092
static INLINE void set_bit(MatchContext *mc, MatchWord *m, int ch, int state)
Definition: grepper.c:51
static INLINE void reset_bit(MatchContext *mc, MatchWord *m, int ch, int state)
Definition: grepper.c:59
unsigned MatchWord
Definition: grepper.c:38
static int go(MatchContext *mc, struct DFA *dfa, FILE *inf)
Definition: grepper.c:227
#define INLINE
Definition: grepper.c:49
static void mask_shift(MatchContext *mc, MatchWord *Rdst, MatchWord *Rsrc, struct DFA *dfa, int ch)
Definition: grepper.c:106
int main(int argc, char **argv)
Definition: grepper.c:349
static void or(MatchContext *mc, MatchWord *Rdst, MatchWord *Rsrc1, MatchWord *Rsrc2)
Definition: grepper.c:218
static int show_line
Definition: grepper.c:36
#define WORD_BITS
Definition: grepper.c:39
static INLINE MatchWord get_bit(MatchContext *mc, MatchWord *m, int ch, int state)
Definition: grepper.c:68
static int grep_file(struct DFA *dfa, const char *fname, int range)
Definition: grepper.c:323
char * prog
Definition: grepper.c:35
static void shift(MatchContext *mc, MatchWord *Rdst, MatchWord *Rsrc, struct DFA *dfa)
Definition: grepper.c:167
#define INFBUF_SIZE
Definition: grepper.c:47
static MatchContext * mk_MatchContext(struct DFA *dfa, int range)
Definition: grepper.c:77
void ifree(void *p)
Definition: imalloc.c:93
void * icalloc(size_t size)
Definition: imalloc.c:68
void * imalloc(size_t size)
Definition: imalloc.c:43
static FILE * inf
Definition: readfile.c:37
Definition: dfa.h:42
short rule_no
Definition: dfa.h:49
short tran_no
Definition: dfa.h:48
struct DFA_tran * trans
Definition: dfa.h:45
unsigned short to
Definition: dfa.h:32
unsigned char ch[2]
Definition: dfa.h:31
Definition: dfa.h:53
struct DFA_state ** states
Definition: dfa.h:55
int no_states
Definition: dfa.h:54
MatchWord * Sc
Definition: grepper.c:44
int range
Definition: grepper.c:43