Coverage for ckanext/udc/views.py: 13%
331 statements
« prev ^ index » next coverage.py v7.7.1, created at 2026-01-19 23:48 +0000
« prev ^ index » next coverage.py v7.7.1, created at 2026-01-19 23:48 +0000
1# -*- coding: utf-8 -*-
2from __future__ import annotations
3import logging
5from datetime import datetime
6from collections import OrderedDict
7from functools import partial
8from typing import Any, Iterable, Optional, Union, cast
9from ckanext.udc.graph.logic import get_catalogue_graph
10from werkzeug.datastructures import MultiDict
12from flask import Blueprint
14import ckan.lib.base as base
15from ckan.lib.helpers import helper_functions as h
16from ckan.lib.helpers import Page
17import ckan.lib.navl.dictization_functions as dict_fns
18import ckan.logic as logic
19import ckan.model as model
20import ckan.plugins as plugins
21import ckan.authz as authz
22import ckan.plugins.toolkit as tk
23import ckan.model as model
25from ckan.lib.helpers import Page
26from ckan.common import asbool, current_user, CKANConfig, request, g, config, _
27from ckan.views.dataset import _sort_by, _pager_url, _setup_template_variables, _get_pkg_template
28from ckan.lib.search import SearchQueryError, SearchError
30from ckan.types import Context, Response
32import chalk
33import re
34from urllib.parse import urlencode
35from ckanext.udc.solr.config import get_current_lang
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Flask blueprint holding the catalogue search routes registered below
# via ``@bp.route``.
bp = Blueprint("catalogue_search", __name__)
def remove_field(package_type: Optional[str],
                 key: str,
                 value: Optional[str] = None,
                 replace: Optional[str] = None):
    """Build a search URL with one filter removed from the current request.

    The URL targets the ``<package_type>.search`` endpoint (``dataset`` when
    *package_type* is empty) and carries every current query parameter
    except ``page`` and the removed filter.

    :param package_type: endpoint prefix used to resolve the search URL.
    :param key: filter name; a leading ``extras_`` or, failing that, a
        trailing ``_ngram`` is stripped before matching.
    :param value: when truthy, only the ``(name, value)`` pair is removed
        for each of ``exact_<key>``, ``fts_<key>`` and ``<key>``; when
        falsy, every parameter carrying one of those three names is removed.
    :param replace: when not ``None``, ``(key, replace)`` is appended after
        the removal.
    :returns: the search URL followed by ``?`` and the encoded parameters.
    """
    if not package_type:
        package_type = u'dataset'
    url = h.url_for(u'{0}.search'.format(package_type))

    if key.startswith('extras_'):
        # Remove the extras_ prefix
        key = key[7:]
    elif key.endswith('_ngram'):
        # Remove the _ngram suffix
        key = key[:-6]

    # Any change of filters resets pagination, so ``page`` is never kept.
    params = [
        (k, v) for k, v in request.args.items(multi=True)
        if k != 'page'
    ]

    # A filter may be present under its plain name or under the
    # ``exact_`` / ``fts_`` prefixed variants.
    candidate_names = (f"exact_{key}", f"fts_{key}", key)
    if value:
        # Assume `fts_` and `exact_` cannot be used together with the
        # same value; remove at most one occurrence per variant.
        for name in candidate_names:
            if (name, value) in params:
                params.remove((name, value))
    else:
        # No value given: drop the filter entirely, including the prefixed
        # variants (previously only the plain name was removed here).
        params = [(k, v) for k, v in params if k not in candidate_names]

    if replace is not None:
        params.append((key, replace))

    params = [(k, v.encode('utf-8') if isinstance(v, str) else str(v))
              for k, v in params]
    return url + u'?' + urlencode(params)
# Solr type = text_general/text
# FTS only
# TODO: If exact match is needed, we need to alter the ckan solr schema
CKAN_FTS_FIELDS = ["title", "notes", "url", "version",
                   "author", "author_email", "maintainer", "maintainer_email"]

# Core string facets: exact match; optional _ngram if you enable Option B
CORE_STRING_FACETS = ["organization", "license_id"]


def _solr_field_for(param_kind: str, ui_field: str, lang: str,
                    text_fields: set[str]) -> str | None:
    """Translate a stable UI field name into the Solr field to query.

    :param param_kind: one of ``'fts'``, ``'exact'``, ``'min'``, ``'max'``;
        any other value yields ``None``.
    :param ui_field: field name as used in the request parameters.
    :param lang: language code embedded in multilingual field names.
    :param text_fields: names of the multilingual text fields.
    :returns: the Solr field name, or ``None`` when the combination is not
        supported (unknown kind, or a non-``fts`` kind on a CKAN core text
        field).
    """
    if param_kind not in ('fts', 'exact', 'min', 'max'):
        return None

    is_range = param_kind in ('min', 'max')
    extras_field = f"extras_{ui_field}"
    # Multilingual fields use the ``_txt`` variant for full-text search and
    # the ``_f`` variant for exact matching.
    suffix = 'txt' if param_kind == 'fts' else 'f'

    # Multilingual text fields; ranges go to the extras_* field.
    if ui_field in text_fields:
        return extras_field if is_range else f"{ui_field}_{lang}_{suffix}"

    # Tags are multilingual too; a range on tags falls through below.
    if ui_field == "tags" and not is_range:
        return f"tags_{lang}_{suffix}"

    # CKAN core text fields support full-text search only.
    if ui_field in CKAN_FTS_FIELDS:
        return f"{ui_field}_{lang}_txt" if param_kind == 'fts' else None

    # Core string facets are matched exactly on the field itself.
    # (FTS via an ``_ngram`` field is not supported in the current schema.)
    if ui_field in CORE_STRING_FACETS and param_kind == 'exact':
        return ui_field

    # Everything else (selects / numbers / dates etc) uses extras_*.
    return extras_field
def _escape_solr_phrase(value: str) -> str:
    """Escape *value* so it can be embedded in a double-quoted Solr phrase."""
    # The backslash must be escaped first so the escapes added for the
    # quotes are not themselves doubled.
    return value.replace('\\', '\\\\').replace('"', '\\"')


def _build_fq(fields_grouped: dict[str, Any],
              filter_logics: dict[str, str],
              include_undefined: set,
              date_fields: Any) -> str:
    """Render the grouped filters as a Solr ``fq`` string.

    Each clause is prefixed with a single space, so the result is either
    empty or starts with a space and can be extended by concatenation.
    """
    clauses = []
    for solr_key, opts in fields_grouped.items():
        # Value group: one quoted phrase, or several joined by AND/OR.
        vals = opts.get('values')
        if vals:
            quoted = [f'"{_escape_solr_phrase(v)}"' for v in vals]
            if len(quoted) > 1:
                logic_op = filter_logics.get(opts.get('ui', solr_key), 'OR')
                joined = f' {logic_op} '.join(quoted)
                clauses.append(f'{solr_key}:({joined})')
            else:
                clauses.append(f'{solr_key}:{quoted[0]}')

        # Range group (dates / numbers on extras_*).
        _min = opts.get('min')
        _max = opts.get('max')
        if not (_min or _max):
            continue

        # Date normalization if the underlying UI field was a date field.
        ui_name = opts.get('ui')
        if ui_name and solr_key.startswith("extras_") and ui_name in date_fields:
            try:
                # Convert date to UTC ISO format
                if _min:
                    _min = datetime.strptime(_min, '%Y-%m-%d').strftime(
                        '%Y-%m-%dT%H:%M:%SZ')
                if _max:
                    day = datetime.strptime(_max, '%Y-%m-%d')
                    # Add 23:59:59 to the max date to include the whole day
                    _max = day.replace(hour=23, minute=59, second=59).strftime(
                        '%Y-%m-%dT%H:%M:%SZ')
            except ValueError:
                # If the date is not in the correct format, skip the range
                continue

        # NOTE(review): bounds of non-date ranges are interpolated as
        # received from the request; consider validating them as numbers.
        range_query = f'{solr_key}:[{_min or "*"} TO {_max or "*"}]'

        # Optionally also match documents that have no value for the field.
        if solr_key in include_undefined:
            range_query = f'({range_query} OR (*:* -{solr_key}:[* TO *]))'
        clauses.append(range_query)

    return ''.join(f' {clause}' for clause in clauses)


def _get_search_details() -> dict[str, Any]:
    """Parse the current request's query parameters into search filters.

    Parameters named ``q``, ``page`` or ``sort``, parameters starting with
    ``_`` and parameters with an empty value are ignored.  The rest are
    interpreted by prefix:

    * ``filter-logic-<field>``: ``and`` (case-insensitive) switches the
      field's values to AND; ``date`` / ``number`` make the field's range
      filter also match documents without a value.
    * ``fts_<field>`` / ``exact_<field>``: value filters.
    * ``min_<field>`` / ``max_<field>``: range bounds.
    * ``ext_*``: collected into ``search_extras``.
    * anything else: appended to ``fields`` (not added to ``fq``).

    :returns: a dict with the keys ``fields``, ``fields_grouped``, ``fq``,
        ``search_extras`` and ``filter_logics``.
    """
    fields = []
    fields_grouped = {}  # key: solr_field -> { ui, values|min|max, fts:bool }
    filter_logics = {}  # key: ui_field -> 'AND' (absent means 'OR')
    include_undefined = set()  # solr field names for date/number 'include empty'
    search_extras: 'MultiDict[str, Any]' = MultiDict()

    udc = plugins.get_plugin('udc')
    # Only the text fields support full text search
    text_fields = udc.text_fields
    date_fields = udc.date_fields

    lang = request.args.get('lang') or get_current_lang()

    for (param, value) in request.args.items(multi=True):
        # Ignore internal parameters and empty values
        if param in (u'q', u'page', u'sort') or not value \
                or param.startswith(u'_'):
            continue
        log.debug("Search param: %s=%s", param, value)

        # Toggle logic
        if param.startswith('filter-logic-'):
            ui_name = param[len('filter-logic-'):]
            if value.lower() == 'and':
                filter_logics[ui_name] = 'AND'
            elif value in ('date', 'number'):
                # Include documents with an undefined date/number; the
                # 'min' kind resolves to the same base field as 'max'.
                solr_key = _solr_field_for('min', ui_name, lang, text_fields)
                if solr_key:
                    include_undefined.add(solr_key)
            continue

        # fts_* / exact_* value filters
        kind = next((k for k in ('fts', 'exact')
                     if param.startswith(k + '_')), None)
        if kind is not None:
            ui_name = param[len(kind) + 1:]
            solr_key = _solr_field_for(kind, ui_name, lang, text_fields)
            if solr_key:
                group = fields_grouped.setdefault(solr_key, {'ui': ui_name})
                # The group may already exist from a min_/max_ parameter
                # mapped to the same Solr field, so fill in what is missing
                # instead of assuming the 'values' list is there.
                group.setdefault('fts', kind == 'fts')
                group.setdefault('values', []).append(value)
            continue

        # min_/max_ (numbers/dates on extras_*)
        kind = next((k for k in ('min', 'max')
                     if param.startswith(k + '_')), None)
        if kind is not None:
            ui_name = param[len(kind) + 1:]
            solr_key = _solr_field_for(kind, ui_name, lang, text_fields)
            if solr_key:
                fields_grouped.setdefault(solr_key, {'ui': ui_name})[kind] = value
            continue

        # legacy / unknown -> pass-through
        if not param.startswith(u'ext_'):
            fields.append((param, value))
        else:
            search_extras.update({param: value})

    fq = _build_fq(fields_grouped, filter_logics, include_undefined,
                   date_fields)

    # Single-valued extras are unwrapped from their list.
    extras = {k: v[0] if len(v) == 1 else v
              for k, v in search_extras.lists()}

    return {
        u'fields': fields,
        u'fields_grouped': fields_grouped,
        u'fq': fq,
        u'search_extras': extras,
        'filter_logics': filter_logics
    }
def _facet_alias_map(facet_keys: list[str], lang: str) -> tuple[list[str], dict[str, str]]:
    """Resolve stable facet keys to the Solr fields to facet on.

    Mapping rules, in order:

    * ``tags`` -> ``tags_<lang>_f``
    * keys starting with ``extras_`` -> unchanged
    * keys listed in the udc plugin's ``text_fields`` -> ``<key>_<lang>_f``
    * anything else -> unchanged

    :returns: ``(solr_fields, alias_to_solr)`` where ``solr_fields`` holds
        the distinct Solr field names in first-seen order and
        ``alias_to_solr`` maps each facet key to its Solr field.
    """
    udc = plugins.get_plugin('udc')
    text_fields = set(udc.text_fields or [])

    def to_solr(key: str) -> str:
        if key == "tags":
            return f"tags_{lang}_f"
        # extras_* keys are never rewritten, even if listed as text fields.
        if key in text_fields and not key.startswith("extras_"):
            return f"{key}_{lang}_f"
        return key

    alias_to_solr = OrderedDict((key, to_solr(key)) for key in facet_keys)

    # dict.fromkeys de-duplicates while preserving order.
    solr_fields = list(dict.fromkeys(alias_to_solr.values()))
    return solr_fields, alias_to_solr
@bp.route(
    "/catalogue",
    endpoint="search",
    strict_slashes=False
)
def custom_dataset_search():
    """Render the catalogue search page.

    Reads the query, paging, sorting and filter parameters from the current
    request, runs the ``package_search`` action and renders
    ``package/custom_search.html`` with the results.

    Aborts with 403 when ``site_read`` is not authorized, and with 400 when
    the search query is rejected or a ``_<facet>_limit`` parameter is not an
    integer.  Other search errors are logged and rendered as an empty result
    page with ``query_error`` set.
    """
    package_type = 'catalogue'
    extra_vars: dict[str, Any] = {}

    try:
        context = cast(Context, {
            u'model': model,
            u'user': current_user.name,
            u'auth_user_obj': current_user
        })
        logic.check_access(u'site_read', context)
    except logic.NotAuthorized:
        base.abort(403, _(u'Not authorized to see this page'))

    # unicode format (decoded from utf8)
    extra_vars[u'q'] = q = request.args.get(u'q', u'')

    extra_vars['query_error'] = False
    page = h.get_page_number(request.args)

    limit = config.get(u'ckan.datasets_per_page')

    # most search operations should reset the page counter:
    params_nopage = [(k, v) for k, v in request.args.items(multi=True)
                     if k != u'page']

    # remove_field is bound to this package type so the template can build
    # links that drop a single filter from the current search.
    extra_vars[u'remove_field'] = partial(remove_field, package_type)

    sort_by = request.args.get(u'sort', None)
    params_nosort = [(k, v) for k, v in params_nopage if k != u'sort']

    extra_vars[u'sort_by'] = partial(_sort_by, params_nosort, package_type)

    if not sort_by:
        sort_by_fields = []
    else:
        sort_by_fields = [field.split()[0] for field in sort_by.split(u',')]
    extra_vars[u'sort_by_fields'] = sort_by_fields

    pager_url = partial(_pager_url, params_nopage, package_type)

    details = _get_search_details()
    log.debug(u'Search details: %r', details)
    extra_vars[u'fields'] = details[u'fields']
    extra_vars[u'fields_grouped'] = details[u'fields_grouped']
    extra_vars[u'filter_logics'] = details[u'filter_logics']
    fq = details[u'fq']
    search_extras = details[u'search_extras']

    context = cast(Context, {
        u'model': model,
        u'session': model.Session,
        u'user': current_user.name,
        u'for_view': True,
        u'auth_user_obj': current_user
    })

    # Unless changed via config options, don't show other dataset
    # types on any search page. Potential alternatives are to show them
    # on the default search page (dataset) or on one other search page
    search_all_type = config.get(u'ckan.search.show_all_types')
    search_all = False

    try:
        # If the "type" is set to True or False, convert to bool
        # and we know that no type was specified, so use traditional
        # behaviour of applying this only to dataset type
        search_all = asbool(search_all_type)
        search_all_type = u'dataset'
    # Otherwise we treat as a string representing a type
    except ValueError:
        search_all = True

    if not search_all or package_type != search_all_type:
        # Only show datasets of this particular type
        fq += u' +dataset_type:{type}'.format(type=package_type)

    facets: dict[str, str] = OrderedDict()

    org_label = h.humanize_entity_type(
        u'organization',
        h.default_group_type(u'organization'),
        u'facet label') or _(u'Organizations')

    group_label = h.humanize_entity_type(
        u'group',
        h.default_group_type(u'group'),
        u'facet label') or _(u'Groups')

    default_facet_titles = {
        u'organization': org_label,
        u'groups': group_label,
        u'tags': _(u'Tags'),
        u'res_format': _(u'Formats'),
        u'license_id': _(u'Licenses'),
    }

    for facet in h.facets():
        if facet in default_facet_titles:
            facets[facet] = default_facet_titles[facet]
        else:
            facets[facet] = facet

    # Facet titles
    for plugin in plugins.PluginImplementations(plugins.IFacets):
        facets = plugin.dataset_facets(facets, package_type)

    extra_vars[u'facet_titles'] = facets

    # Facets are requested under language-specific Solr field names and
    # mapped back to their stable keys once the results are in.
    lang = request.args.get('lang') or get_current_lang()
    facet_fields_stable = list(facets.keys())
    solr_facet_fields, alias_to_solr = _facet_alias_map(facet_fields_stable, lang)

    data_dict: dict[str, Any] = {
        u'q': q,
        u'fq': fq.strip(),
        u'facet.field': solr_facet_fields,
        u'rows': limit,
        u'start': (page - 1) * limit,
        u'sort': sort_by,
        u'extras': search_extras,
        u'include_private': config.get(
            u'ckan.search.default_include_private'),
    }
    try:
        query = logic.get_action(u'package_search')(context, data_dict)

        extra_vars[u'sort_by_selected'] = query[u'sort']

        extra_vars[u'page'] = Page(
            collection=query[u'results'],
            page=page,
            url=pager_url,
            item_count=query[u'count'],
            items_per_page=limit
        )

        # Re-key the facet results from Solr field names to stable keys.
        raw_facets = query.get('search_facets', {})
        extra_vars[u'search_facets'] = {
            alias: raw_facets[solr_name]
            for alias, solr_name in alias_to_solr.items()
            if solr_name in raw_facets
        }
        extra_vars[u'page'].items = query[u'results']
    except SearchQueryError as se:
        # User's search parameters are invalid, in such a way that is not
        # achievable with the web interface, so return a proper error to
        # discourage spiders which are the main cause of this.
        log.info(u'Dataset search query rejected: %r', se.args)
        base.abort(
            400,
            _(u'Invalid search query: {error_message}')
            .format(error_message=str(se))
        )
    except SearchError as se:
        # May be bad input from the user, but may also be more serious like
        # bad code causing a SOLR syntax error, or a problem connecting to
        # SOLR
        log.error(u'Dataset search error: %r', se.args)
        extra_vars[u'query_error'] = True
        extra_vars[u'search_facets'] = {}
        extra_vars[u'page'] = Page(collection=[])

    # FIXME: try to avoid using global variables
    g.search_facets_limits = {}
    default_limit: int = config.get(u'search.facets.default')
    for facet in cast(Iterable[str], extra_vars[u'search_facets'].keys()):
        try:
            facet_limit = int(
                request.args.get(
                    u'_%s_limit' % facet,
                    default_limit
                )
            )
        except ValueError:
            base.abort(
                400,
                _(u'Parameter u"{parameter_name}" is not '
                  u'an integer').format(parameter_name=u'_%s_limit' % facet)
            )

        g.search_facets_limits[facet] = facet_limit

    _setup_template_variables(context, {}, package_type=package_type)

    extra_vars[u'dataset_type'] = package_type

    # TODO: remove
    for key, value in extra_vars.items():
        setattr(g, key, value)

    return base.render('package/custom_search.html', extra_vars)
# Redirect /dataset to /catalogue
@bp.route("/dataset", endpoint="redirect-search", strict_slashes=False)
def redirect_to_catalogue_search():
    """Redirect ``/dataset`` to the equivalent ``/catalogue`` URL.

    The first ``/dataset`` in the request path is replaced by
    ``/catalogue`` and the original query string is kept.  When the WSGI
    environ carries a ``CKAN_LANG`` whose ``CKAN_LANG_IS_DEFAULT`` flag is
    falsy, the language is prepended to the path unless already present.
    """
    query = request.query_string.decode("utf-8") if request.query_string else ""
    target = request.path.replace("/dataset", "/catalogue", 1)

    environ = request.environ
    lang = environ.get("CKAN_LANG")
    lang_is_default = environ.get("CKAN_LANG_IS_DEFAULT", True)
    if lang and not lang_is_default:
        if not target.startswith(f"/{lang}/"):
            target = f"/{lang}{target}"

    if query:
        target = f"{target}?{query}"
    return tk.redirect_to(target)
# Blueprint for the raw knowledge-graph endpoints registered below via
# ``@graph_blueprint.route``.
graph_blueprint = Blueprint('udc_graph', __name__)
@graph_blueprint.route('/catalogue/<package_id>/graph')
@graph_blueprint.route('/catalogue/<package_id>/graph.<format>')
def package_graph(package_id, format=None):
    """
    Return the knowledge graph for a package in raw RDF format.

    URL: /catalogue/<package_id>/graph[.format]

    The serialization is taken from the URL extension or, failing that,
    from the ``format`` query parameter (default ``turtle``).

    Aborts with:
      * 400 when the format is not supported,
      * 503 when the udc plugin reports GraphDB as disabled,
      * 403 / 404 when ``package_show`` is not authorized / the package
        does not exist,
      * 404 when ``get_catalogue_graph`` raises ``ValueError``,
      * 500 on any other error while retrieving the graph.
    """
    from flask import Response, request, abort

    # Get format from URL extension or query parameter
    if not format:
        format = request.args.get('format', 'turtle')

    # Map file extensions to format names
    format_mapping = {
        'ttl': 'turtle',
        'turtle': 'turtle',
        'jsonld': 'json-ld',
        'json-ld': 'json-ld',
        'rdf': 'xml',
        'xml': 'xml',
        'n3': 'n3',
        'nt': 'nt',
    }

    format = format_mapping.get(format.lower(), format.lower())

    # Validate format
    valid_formats = {'turtle', 'json-ld', 'xml', 'n3', 'nt', 'pretty-xml'}
    if format not in valid_formats:
        # Sorted so the message is stable; set order is not guaranteed.
        abort(400, description=f"Invalid format '{format}'. Must be one of: {', '.join(sorted(valid_formats))}")

    # Check if GraphDB is disabled
    if plugins.get_plugin('udc').disable_graphdb:
        abort(503, description='Knowledge graph feature is disabled. GraphDB connection is not available.')

    # Check access - use package_show authorization
    try:
        tk.check_access('package_show', {'user': current_user.name if current_user else None},
                        {'id': package_id})
    except tk.NotAuthorized:
        abort(403, description='Not authorized to access this package')
    except tk.ObjectNotFound:
        abort(404, description=f'Package not found: {package_id}')

    # Get the graph
    try:
        graph_data = get_catalogue_graph(package_id, format)
    except ValueError as e:
        log.error("Error retrieving graph for package %s: %s", package_id, e)
        abort(404, description=str(e))
    except Exception:
        # log.exception keeps the traceback of the unexpected failure.
        log.exception("Unexpected error retrieving graph for package %s", package_id)
        abort(500, description='Error occurred while retrieving knowledge graph')

    # Set appropriate content type
    content_types = {
        'turtle': 'text/turtle; charset=utf-8',
        'json-ld': 'application/ld+json; charset=utf-8',
        'xml': 'application/rdf+xml; charset=utf-8',
        'pretty-xml': 'application/rdf+xml; charset=utf-8',
        'n3': 'text/n3; charset=utf-8',
        'nt': 'application/n-triples; charset=utf-8',
    }

    content_type = content_types.get(format, 'text/plain; charset=utf-8')

    # These values already carry the charset parameter, so they are passed
    # as the full ``content_type``; ``mimetype=`` would have a second
    # charset appended for text/* types.
    return Response(graph_data, content_type=content_type)