YAZ  4.2.57
solr.c
Go to the documentation of this file.
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2013 Index Data
3  * See the file LICENSE for details.
4  */
9 #if HAVE_CONFIG_H
10 #include <config.h>
11 #endif
12 
13 #include <stdlib.h>
14 #include <assert.h>
15 #include <yaz/srw.h>
16 #include <yaz/matchstr.h>
17 #include <yaz/yaz-iconv.h>
18 #include <yaz/log.h>
19 #include <yaz/facet.h>
20 #include <yaz/wrbuf.h>
21 
22 #include "sru-p.h"
23 
24 #define SOLR_MAX_PARAMETERS 100
25 
26 #if YAZ_HAVE_XML2
27 #include <libxml/parser.h>
28 #include <libxml/tree.h>
29 
30 static void extract_text_node(xmlNodePtr node, WRBUF wrbuf) {
31  xmlNodePtr child;
32  for (child = node->children; child ; child = child->next)
33  {
34  if (child->type == XML_TEXT_NODE)
35  wrbuf_puts(wrbuf, (const char *) child->content);
36  }
37 }
38 
/*
 * Test whether an XML node has a given element name and, optionally, a
 * given attribute value.
 *
 * ptr            node to test
 * node_name      required element name (compared with strcmp)
 * attribute_name attribute to inspect, or NULL to skip the attribute test
 * value          required attribute value; only read when attribute_name
 *                is non-NULL
 *
 * Returns 1 on a match and 0 otherwise.
 *
 * NOTE(review): the opening line of this definition (listing line 39, which
 * would carry the return type and the function name) is absent from this
 * listing.  The name match_xml_node_attribute is inferred from the call
 * sites further down - confirm against the real source.
 */
40  xmlNodePtr ptr,
41  const char *node_name, const char *attribute_name, const char *value)
42 {
43  const char *attribute_value;
44  // check if the node name matches
45  if (strcmp((const char*) ptr->name, node_name))
46  return 0;
47  if (attribute_name)
48  {
  /* A missing attribute (NULL lookup result) counts as "no match". */
49  attribute_value = yaz_element_attribute_value_get(ptr, node_name,
50  attribute_name);
51  if (attribute_value && !strcmp(attribute_value, value))
52  return 1;
53  }
54  else /* No attribute to check */
55  return 1;
56  return 0;
57 }
58 
/*
 * Populate sr->records from the element children of `ptr`.
 *
 * The element children are counted into sr->num_records; when that count is
 * non-zero an array of that many records is allocated from `o`.  Each
 * element child is then deep-copied, serialised with xmlNodeDump, and the
 * resulting bytes are stored (with an extra terminating NUL) as the record
 * data.  recordPosition is set to start + offset + 1, where offset is the
 * zero-based index of the element child.
 *
 * When there are no element children sr->records is left untouched.
 *
 * NOTE(review): listing lines 61 (the final parameter - `sr` is used below
 * but its declaration is not visible) and 88 (one statement between the
 * recordSchema and recordData_len assignments) are absent from this
 * listing - confirm against the real source.
 */
59 static void yaz_solr_decode_result_docs(ODR o, xmlNodePtr ptr,
60  Odr_int start,
62 {
63  xmlNodePtr node;
64  int offset = 0;
65  int i = 0;
66 
  /* Pass 1: count element children so the record array can be sized. */
67  sr->num_records = 0;
68  for (node = ptr->children; node; node = node->next)
69  if (node->type == XML_ELEMENT_NODE)
70  sr->num_records++;
71 
72  if (sr->num_records)
73  sr->records = odr_malloc(o, sizeof(*sr->records) * sr->num_records);
74 
  /* Pass 2: serialise each element child into its record slot. */
75  for (node = ptr->children; node; node = node->next)
76  {
77  if (node->type == XML_ELEMENT_NODE)
78  {
79  Z_SRW_record *record = sr->records + i;
80  xmlBufferPtr buf = xmlBufferCreate();
81  xmlNode *tmp = xmlCopyNode(node, 1);
82 
83  xmlNodeDump(buf, tmp->doc, tmp, 0, 0);
84 
85  xmlFreeNode(tmp);
86 
87  record->recordSchema = 0;
  /* Copy out of the libxml buffer (freed below) into ODR memory. */
89  record->recordData_len = buf->use;
90  record->recordData_buf = odr_malloc(o, buf->use + 1);
91  memcpy(record->recordData_buf, buf->content, buf->use);
92  record->recordData_buf[buf->use] = '\0';
93  record->recordPosition = odr_intdup(o, start + offset + 1);
94 
95  xmlBufferFree(buf);
96 
97  offset++;
98  i++;
99  }
100  }
101 }
102 
/*
 * Decode the attributes of a result element and, when there are hits, its
 * documents.
 *
 * Walks the attributes of `ptr`; only attributes whose first child is a
 * text node are considered.  "start" is parsed with odr_atoi into the local
 * start offset.  If *sr->numberOfRecords ends up greater than zero the
 * child documents are decoded via yaz_solr_decode_result_docs.
 *
 * Returns 0 when sr->numberOfRecords is non-NULL after the walk, -1
 * otherwise.
 *
 * NOTE(review): listing lines 104 (the final parameter - `sr` is used below
 * but its declaration is not visible) and 113 (the head of the statement in
 * the "numFound" branch; only its trailing argument line survives) are
 * absent from this listing.  Presumably line 113 assigns
 * sr->numberOfRecords from the attribute text - confirm against the real
 * source.
 */
103 static int yaz_solr_decode_result(ODR o, xmlNodePtr ptr,
105 {
106  Odr_int start = 0;
107  struct _xmlAttr *attr;
108  for (attr = ptr->properties; attr; attr = attr->next)
109  if (attr->children && attr->children->type == XML_TEXT_NODE)
110  {
111  if (!strcmp((const char *) attr->name, "numFound"))
112  {
114  (const char *) attr->children->content));
115  }
116  else if (!strcmp((const char *) attr->name, "start"))
117  {
118  start = odr_atoi((const char *) attr->children->content);
119  }
120  }
  /* Documents are only decoded when a positive hit count is present. */
121  if (sr->numberOfRecords && *sr->numberOfRecords > 0)
122  yaz_solr_decode_result_docs(o, ptr, start, sr);
123  if (sr->numberOfRecords)
124  return 0;
125  return -1;
126 }
127 
128 static const char *get_facet_term_count(xmlNodePtr node, Odr_int *freq)
129 {
130  const char *term = yaz_element_attribute_value_get(node, "int", "name");
131  xmlNodePtr child;
132  WRBUF wrbuf = wrbuf_alloc();
133  if (!term)
134  return term;
135 
136  for (child = node->children; child ; child = child->next)
137  {
138  if (child->type == XML_TEXT_NODE)
139  wrbuf_puts(wrbuf, (const char *) child->content);
140  }
141  *freq = odr_atoi(wrbuf_cstr(wrbuf));
142  wrbuf_destroy(wrbuf);
143  return term;
144 }
145 
148 
/*
 * Build a Z_FacetField from one facet list node.
 *
 * The "name" attribute of `ptr` (looked up with element name "lst") becomes
 * the use attribute of the facet field.  One facet term is created per
 * child node of `ptr`, using get_facet_term_count for the term text and its
 * count.  Returns the new facet field.
 *
 * NOTE(review): the signature (listing lines 146-147) is absent from this
 * listing; the name yaz_solr_decode_facet_field and the arguments
 * (o, node, sr) are inferred from the call site in
 * yaz_solr_decode_facet_counts - confirm against the real source.
 *
 * NOTE(review): both loops visit every child node without a node-type
 * filter, and the term returned by get_facet_term_count is passed on
 * without a NULL check - confirm that non-element children (e.g.
 * whitespace text) cannot occur here.
 */
149 {
150  Z_AttributeList *list;
151  Z_FacetField *facet_field;
152  int num_terms = 0;
153  int index = 0;
154  xmlNodePtr node;
155  // USE attribute
156  const char* name = yaz_element_attribute_value_get(ptr, "lst", "name");
157  list = yaz_use_attribute_create(o, name);
  /* Pass 1: count children to size the facet field. */
158  for (node = ptr->children; node; node = node->next)
159  num_terms++;
160  facet_field = facet_field_create(o, list, num_terms);
161  index = 0;
  /* Pass 2: fill one term slot per child. */
162  for (node = ptr->children; node; node = node->next)
163  {
164  Odr_int count = 0;
165  const char *term = get_facet_term_count(node, &count);
166  facet_field_term_set(o, facet_field,
167  facet_term_create_cstr(o, term, count), index);
168  index++;
169  }
170  return facet_field;
171 }
172 
/*
 * Decode the facet fields found under `root` into sr->facetList.
 *
 * Scans the children of `root` for the first <lst name="facet_fields">
 * node.  A facet list with one slot per child of that node is created and
 * each child is decoded with yaz_solr_decode_facet_field.  Only the first
 * matching node is processed.  sr->facetList is left untouched when no
 * such node exists.
 *
 * Always returns 0.
 *
 * NOTE(review): listing line 174 (the final parameter - `sr` is used below
 * but its declaration is not visible) is absent from this listing.
 */
173 static int yaz_solr_decode_facet_counts(ODR o, xmlNodePtr root,
175 {
176  xmlNodePtr ptr;
177  for (ptr = root->children; ptr; ptr = ptr->next)
178  {
179  if (match_xml_node_attribute(ptr, "lst", "name", "facet_fields"))
180  {
181  xmlNodePtr node;
182  Z_FacetList *facet_list;
183  int num_facets = 0;
  /* Pass 1: count children (no node-type filter) to size the list. */
184  for (node = ptr->children; node; node= node->next)
185  {
186  num_facets++;
187  }
188  facet_list = facet_list_create(o, num_facets);
  /* num_facets is reused as the running slot index for pass 2. */
189  num_facets = 0;
190  for (node = ptr->children; node; node= node->next)
191  {
192  facet_list_field_set(o, facet_list,
193  yaz_solr_decode_facet_field(o, node, sr),
194  num_facets);
195  num_facets++;
196  }
197  sr->facetList = facet_list;
198  break;
199  }
200  }
201  return 0;
202 }
203 
204 static void yaz_solr_decode_suggestion_values(xmlNodePtr listPptr, WRBUF wrbuf)
205 {
206  xmlNodePtr node;
207  for (node = listPptr; node; node= node->next) {
208  if (!strcmp((char*) node->name, "lst")) {
209  xmlNodePtr child;
210  for (child = node->children; child; child= child->next) {
211  if (match_xml_node_attribute(child, "str", "name", "word")) {
212  wrbuf_puts(wrbuf, "<suggestion>");
213  extract_text_node(child, wrbuf);
214  wrbuf_puts(wrbuf, "</suggestion>\n");
215  }
216  }
217  }
218  }
219 }
220 
221 static void yaz_solr_decode_suggestion_lst(xmlNodePtr lstPtr, WRBUF wrbuf)
222 {
223  xmlNodePtr node;
224  for (node = lstPtr; node; node= node->next) {
225  if (match_xml_node_attribute(node, "arr", "name", "suggestion")) {
226  yaz_solr_decode_suggestion_values(node->children, wrbuf);
227  }
228  }
229 }
230 
231 static void yaz_solr_decode_misspelled(xmlNodePtr lstPtr, WRBUF wrbuf)
232 {
233  xmlNodePtr node;
234  for (node = lstPtr; node; node= node->next)
235  {
236  if (!strcmp((const char*) node->name, "lst")) {
237  const char *misspelled = yaz_element_attribute_value_get(node, "lst", "name");
238  if (misspelled) {
239  wrbuf_printf(wrbuf, "<misspelled term=\"%s\">\n", misspelled);
240  yaz_solr_decode_suggestion_lst(node->children, wrbuf);
241  wrbuf_puts(wrbuf, "</misspelled>\n");
242  }
243  }
244  }
245 }
246 
247 static int yaz_solr_decode_spellcheck(ODR o, xmlNodePtr spellcheckPtr, Z_SRW_searchRetrieveResponse *sr)
248 {
249  xmlNodePtr ptr;
250  WRBUF wrbuf = wrbuf_alloc();
251  wrbuf_puts(wrbuf, "");
252  for (ptr = spellcheckPtr->children; ptr; ptr = ptr->next)
253  {
254  if (match_xml_node_attribute(ptr, "lst", "name", "suggestions"))
255  {
256  yaz_solr_decode_misspelled(ptr->children, wrbuf);
257  }
258  }
259  sr->suggestions = odr_strdup(o, wrbuf_cstr(wrbuf));
260  return 0;
261 }
262 
263 static int yaz_solr_decode_scan_result(ODR o, xmlNodePtr ptr,
264  Z_SRW_scanResponse *scr)
265 {
266  xmlNodePtr node;
267  char *pos;
268  int i = 0;
269 
270  /* find the actual list */
271  for (node = ptr->children; node; node = node->next)
272  if (node->type == XML_ELEMENT_NODE) {
273  ptr = node;
274  break;
275  }
276 
277  scr->num_terms = 0;
278  for (node = ptr->children; node; node = node->next)
279  if (node->type == XML_ELEMENT_NODE && !strcmp((const char *) node->name, "int"))
280  scr->num_terms++;
281 
282  if (scr->num_terms)
283  scr->terms = odr_malloc(o, sizeof(*scr->terms) * scr->num_terms);
284 
285  for (node = ptr->children; node; node = node->next)
286  {
287  if (node->type == XML_ELEMENT_NODE && !strcmp((const char *) node->name, "int"))
288  {
289  Z_SRW_scanTerm *term = scr->terms + i;
290 
291  Odr_int count = 0;
292  const char *val = get_facet_term_count(node, &count);
293 
294  term->numberOfRecords = odr_intdup(o, count);
295 
296  /* if val contains a ^ then it is probably term<^>display term so separate them. This is due to
297  * SOLR not being able to encode them into 2 separate attributes.
298  */
299  pos = strchr(val, '^');
300  if (pos != NULL) {
301  term->displayTerm = odr_strdup(o, pos + 1);
302  *pos = '\0';
303  term->value = odr_strdup(o, val);
304  *pos = '^';
305  } else {
306  term->value = odr_strdup(o, val);
307  term->displayTerm = NULL;
308  }
309  term->whereInList = NULL;
310 
311  i++;
312  }
313  }
314 
315  if (scr->num_terms)
316  return 0;
317  return -1;
318 }
319 #endif
320 
/*
 * Decode an HTTP response body (hres->content_buf / content_len) as XML and
 * turn it into an SRW PDU stored in *pdup.
 *
 * With YAZ_HAVE_XML2: the body is parsed with xmlParseMemory and the root
 * element must be named "response".  Its children are scanned for
 *   - a "result" element, decoded with yaz_solr_decode_result;
 *   - <lst name="terms">, decoded with yaz_solr_decode_scan_result;
 *   - once a search response exists and decoded successfully:
 *     <lst name="facet_counts"> when there are hits, or
 *     <lst name="spellcheck"> when there are zero hits.
 * The return value is rc_result + rc_facets; *pdup is only assigned when
 * that sum is 0.  Parse failure, a missing root or a wrong root name give
 * -1.  Without YAZ_HAVE_XML2 the function always returns -1.
 *
 * NOTE(review): the signature (listing line 321) is absent from this
 * listing, so the function name and the declarations of `o`, `hres` and
 * `pdup` are not visible here.  Listing lines 350, 357 and 363 are also
 * absent; presumably 357 and 363 assign `pdu` before it is dereferenced on
 * the following lines - confirm against the real source.
 */
322 {
323 #if YAZ_HAVE_XML2
324  const char *content_buf = hres->content_buf;
325  int content_len = hres->content_len;
326  xmlDocPtr doc = xmlParseMemory(content_buf, content_len);
327  int ret = 0;
328  xmlNodePtr ptr = 0;
329  Z_SRW_PDU *pdu;
330  Z_SRW_searchRetrieveResponse *sr = NULL;
331  Z_SRW_scanResponse *scr = NULL;
332 
333  if (!doc)
334  {
335  ret = -1;
336  }
337  if (doc)
338  {
339  xmlNodePtr root = xmlDocGetRootElement(doc);
340  if (!root)
341  {
342  ret = -1;
343  }
344  else if (strcmp((const char *) root->name, "response"))
345  {
346  ret = -1;
347  }
348  else
349  {
  /* rc_result starts at -1: a result or terms node is required. */
351  int rc_result = -1;
352  int rc_facets = 0;
353  for (ptr = root->children; ptr; ptr = ptr->next)
354  {
355  if (ptr->type == XML_ELEMENT_NODE &&
356  !strcmp((const char *) ptr->name, "result")) {
358  sr = pdu->u.response;
359  rc_result = yaz_solr_decode_result(o, ptr, sr);
360  }
361  if (ptr->type == XML_ELEMENT_NODE &&
362  match_xml_node_attribute(ptr, "lst", "name", "terms")) {
364  scr = pdu->u.scan_response;
365  rc_result = yaz_solr_decode_scan_result(o, ptr, scr);
366  }
367  /* TODO The check on hits is a work-around to avoid garbled facets on zero results from the SOLR server.
368  * The work-around works because the results is before the facets in the xml. */
369  if (sr) {
370  if (rc_result == 0 && *sr->numberOfRecords > 0 &&
371  match_xml_node_attribute(ptr, "lst", "name", "facet_counts"))
372  rc_facets = yaz_solr_decode_facet_counts(o, ptr, sr);
373  if (rc_result == 0 && *sr->numberOfRecords == 0 &&
374  match_xml_node_attribute(ptr, "lst", "name", "spellcheck"))
375  rc_facets = yaz_solr_decode_spellcheck(o, ptr, sr);
376  }
377 
378  }
379  ret = rc_result + rc_facets;
380  }
381  }
382  if (doc)
383  xmlFreeDoc(doc);
  /* The PDU is only handed to the caller on overall success. */
384  if (ret == 0)
385  *pdup = pdu;
386  return ret;
387 #else
388  return -1;
389 #endif
390 }
391 
/*
 * Add the request parameters for one facet field to the name/value arrays.
 *
 * The facet attributes are extracted from facet_field->attributes with
 * yaz_facet_attr_get_z_attributes.  When a use attribute is present a
 * "facet.field" parameter carrying it is added; when the limit is also
 * greater than zero a "f.<useattr>.facet.limit" parameter is added as well.
 * `i` is the running parameter index passed through to the
 * yaz_add_name_value_* helpers.
 *
 * Returns -1 when attribute extraction reports an error code, 0 otherwise
 * (including when there is no use attribute and nothing is added).
 *
 * NOTE(review): the opening line of this definition (listing line 392) is
 * absent from this listing; the name yaz_solr_encode_facet_field is
 * inferred from the call site in the facet-list encoder - confirm against
 * the real source.
 */
393  ODR encode, char **name, char **value, int *i,
394  Z_FacetField *facet_field)
395 {
396  Z_AttributeList *attribute_list = facet_field->attributes;
397  struct yaz_facet_attr attr_values;
398  yaz_facet_attr_init(&attr_values);
399  yaz_facet_attr_get_z_attributes(attribute_list, &attr_values);
400  // TODO do we want to support server decided
401 
402  if (attr_values.errcode)
403  return -1;
404  if (attr_values.useattr)
405  {
406  WRBUF wrbuf = wrbuf_alloc();
407  wrbuf_puts(wrbuf, (char *) attr_values.useattr);
408  yaz_add_name_value_str(encode, name, value, i,
409  "facet.field",
410  odr_strdup(encode, wrbuf_cstr(wrbuf)));
411  if (attr_values.limit > 0)
412  {
  /* Build the per-field limit parameter name: f.<useattr>.facet.limit */
413  WRBUF wrbuf2 = wrbuf_alloc();
414  Odr_int olimit;
415  wrbuf_puts(wrbuf2, "f.");
416  wrbuf_puts(wrbuf2, wrbuf_cstr(wrbuf));
417  wrbuf_puts(wrbuf2, ".facet.limit");
418  olimit = attr_values.limit;
419  yaz_add_name_value_int(encode, name, value, i,
420  odr_strdup(encode, wrbuf_cstr(wrbuf2)),
421  &olimit);
422  wrbuf_destroy(wrbuf2);
423  }
424  wrbuf_destroy(wrbuf);
425  }
426  return 0;
427 }
428 
/*
 * Add the request parameters for every facet field in `facet_list` by
 * calling yaz_solr_encode_facet_field on each element in order.
 *
 * Returns -1 as soon as one field fails to encode, 0 when all succeed
 * (including for an empty list).
 *
 * NOTE(review): the opening line of this definition (listing line 429) is
 * absent from this listing; the name yaz_solr_encode_facet_list is
 * inferred from the call site in the request encoder - confirm against
 * the real source.
 */
430  ODR encode, char **name, char **value,
431  int *i, Z_FacetList *facet_list)
432 {
433  int index;
434  for (index = 0; index < facet_list->num; index++)
435  {
436  int r = yaz_solr_encode_facet_field(encode, name, value, i,
437  facet_list->elements[index]);
438  if (r)
439  return -1;
440 
441  }
442  return 0;
443 }
444 
/*
 * Encode an SRW PDU as an HTTP GET request.
 *
 * Basic-auth headers are added from srw_pdu->username / password.  The PDU
 * kind selects the operation:
 *   - searchRetrieve request -> "select": adds "q", optionally "start"
 *     (startRecord - 1), "rows", "fl", optionally "sort", and the facet
 *     parameters when a facet list is present;
 *   - scan request -> "terms": adds "terms.fl" / "terms.lower" derived
 *     from the scan clause, plus "terms.sort"="index" and "terms.limit".
 * Extra arguments from srw_pdu->extra_args are appended, the arrays are
 * converted with yaz_array_to_uri, and hreq->path is replaced by
 * "<old path>/<operation>?<uri args>".
 *
 * Returns 0 on success; -1 for an unsupported PDU kind or query type, or
 * when facet encoding fails.
 *
 * NOTE(review): many lines are absent from this listing: 445 (start of the
 * signature, so the function name and the declarations of `hreq` and
 * `srw_pdu` are not visible), 450 (presumably the declaration of the
 * `name` / `value` arrays), 461 and 506 (presumably the declarations of
 * `request`), the `case` labels at 465, 469, 489, 491, 510, 516, and 566
 * (head of the final call, of which only the argument line survives).
 * Confirm all of these against the real source.
 */
446  ODR encode, const char *charset)
447 {
448  const char *solr_op = 0;
449  //TODO Change. not a nice hard coded, unchecked limit.
451  char *uri_args;
452  char *path;
453  char *q;
454  char *pos;
455  int i = 0;
456 
457  z_HTTP_header_add_basic_auth(encode, &hreq->headers,
458  srw_pdu->username, srw_pdu->password);
459  if (srw_pdu->which == Z_SRW_searchRetrieve_request)
460  {
462  solr_op = "select";
463  switch (srw_pdu->u.request->query_type)
464  {
466  yaz_add_name_value_str(encode, name, value, &i,
467  "q", request->query.pqf);
468  break;
470  yaz_add_name_value_str(encode, name, value, &i,
471  "q", request->query.cql);
472  break;
473  default:
474  return -1;
475  }
476  if (srw_pdu->u.request->startRecord)
477  {
  /* The request's startRecord is reduced by one for the "start" parameter. */
478  Odr_int start = *request->startRecord - 1;
479  yaz_add_name_value_int(encode, name, value, &i,
480  "start", &start);
481  }
482  yaz_add_name_value_int(encode, name, value, &i,
483  "rows", request->maximumRecords);
484  yaz_add_name_value_str(encode, name, value, &i,
485  "fl", request->recordSchema);
486 
487  switch(srw_pdu->u.request->sort_type)
488  {
490  break;
492  yaz_add_name_value_str(encode, name, value, &i, "sort",
493  srw_pdu->u.request->sort.sortKeys);
494  break;
495  }
496  if (request->facetList)
497  {
498  Z_FacetList *facet_list = request->facetList;
499  yaz_add_name_value_str(encode, name, value, &i, "facet", "true");
500  yaz_add_name_value_str(encode, name, value, &i, "facet.mincount", "1");
501  if (yaz_solr_encode_facet_list(encode, name, value, &i, facet_list))
502  return -1;
503  }
504  }
505  else if (srw_pdu->which == Z_SRW_scan_request) {
507  solr_op = "terms";
508  switch (srw_pdu->u.scan_request->query_type)
509  {
511  yaz_add_name_value_str(encode, name, value, &i,
512  "terms.fl", request->scanClause.pqf);
513  yaz_add_name_value_str(encode, name, value, &i,
514  "terms.lower", request->scanClause.pqf);
515  break;
  /* "field:term" clause: the text after ':' becomes terms.lower and the
   * text before it terms.fl.  The clause is split in place by temporarily
   * overwriting ':' with NUL and restoring it afterwards. */
517  q = request->scanClause.cql;
518  pos = strchr(q, ':');
519  if (pos != NULL) {
520  yaz_add_name_value_str(encode, name, value, &i,
521  "terms.lower", odr_strdup(encode, pos + 1));
522  *pos = '\0';
523  yaz_add_name_value_str(encode, name, value, &i,
524  "terms.fl", odr_strdup(encode, q));
525  *pos = ':';
526  } else {
527  yaz_add_name_value_str(encode, name, value, &i,
528  "terms.lower", odr_strdup(encode, q));
529  }
530  break;
531  default:
532  return -1;
533  }
534  yaz_add_name_value_str(encode, name, value, &i,
535  "terms.sort", "index");
536  yaz_add_name_value_int(encode, name, value, &i,
537  "terms.limit", request->maximumTerms);
538  }
539  else
540  return -1;
541 
542  if (srw_pdu->extra_args)
543  {
  /* NOTE(review): this loop is the only place that checks i against
   * SOLR_MAX_PARAMETERS, and the terminator below is written at index i
   * afterwards - confirm the arrays have room for it when the loop stops
   * at the limit. */
544  Z_SRW_extra_arg *ea = srw_pdu->extra_args;
545  for (; ea && i < SOLR_MAX_PARAMETERS; ea = ea->next)
546  {
547  name[i] = ea->name;
548  value[i] = ea->value;
549  i++;
550  }
551  }
552 
  /* A null name terminates the list handed to yaz_array_to_uri. */
553  name[i++] = 0;
554 
555  yaz_array_to_uri(&uri_args, encode, name, value);
556 
557  hreq->method = "GET";
558 
  /* The +4 leaves room for '/', '?' and the terminating NUL. */
559  path = (char *)
560  odr_malloc(encode, strlen(hreq->path) +
561  strlen(uri_args) + strlen(solr_op) + 4);
562 
563  sprintf(path, "%s/%s?%s", hreq->path, solr_op, uri_args);
564  hreq->path = path;
565 
567  "text/xml", charset);
568  return 0;
569 }
570 
571 
572 /*
573  * Local variables:
574  * c-basic-offset: 4
575  * c-file-style: "Stroustrup"
576  * indent-tabs-mode: nil
577  * End:
578  * vim: shiftwidth=4 tabstop=8 expandtab
579  */
580