Coverage for ckanext/udc/views.py: 13%
347 statements
« prev ^ index » next coverage.py v7.7.1, created at 2026-03-30 22:15 +0000
« prev ^ index » next coverage.py v7.7.1, created at 2026-03-30 22:15 +0000
1# -*- coding: utf-8 -*-
2from __future__ import annotations
3import logging
5from datetime import datetime
6from collections import OrderedDict
7from functools import partial
8from typing import Any, Iterable, Optional, Union, cast
9from ckanext.udc.graph.logic import get_catalogue_graph
10from werkzeug.datastructures import MultiDict
12from flask import Blueprint
14import ckan.lib.base as base
15from ckan.lib.helpers import helper_functions as h
16from ckan.lib.helpers import Page
17import ckan.lib.navl.dictization_functions as dict_fns
18import ckan.logic as logic
19import ckan.model as model
20import ckan.plugins as plugins
21import ckan.authz as authz
22import ckan.plugins.toolkit as tk
23import ckan.model as model
25from ckan.lib.helpers import Page
26from ckan.common import asbool, current_user, CKANConfig, request, g, config, _
27from ckan.views.dataset import _sort_by, _pager_url, _setup_template_variables, _get_pkg_template
28from ckan.lib.search import SearchQueryError, SearchError
30from ckan.types import Context, Response
32import chalk
33import re
34from urllib.parse import urlencode
35from ckanext.udc.solr.config import get_current_lang
# Module-level logger named after this module.
log = logging.getLogger(__name__)
# Blueprint on which the "/catalogue" search view and the "/dataset"
# redirect defined in this module are registered.
bp = Blueprint("catalogue_search", __name__)
def remove_field(package_type: Optional[str],
                 key: str,
                 value: Optional[str] = None,
                 replace: Optional[str] = None):
    """Build a search URL with one filter removed from the current query.

    The URL is based on the ``<package_type>.search`` endpoint and the
    current request's query string, with the ``page`` parameter dropped.

    :param package_type: package type whose search endpoint is targeted;
        falls back to ``'dataset'`` when empty.
    :param key: field name of the filter. A leading ``extras_`` or a trailing
        ``_ngram`` is stripped before matching.
    :param value: when given, only the ``(key, value)`` pair is removed
        (in its plain, ``exact_`` or ``fts_`` form). When omitted, every
        parameter for the field is removed, including the ``exact_`` and
        ``fts_`` forms.
    :param replace: when not ``None``, ``(key, replace)`` is appended after
        the removal.
    :returns: the URL followed by ``?`` and the url-encoded parameters.
    """
    if not package_type:
        package_type = u'dataset'
    url = h.url_for(u'{0}.search'.format(package_type))

    if key.startswith('extras_'):
        # Remove the extras_ prefix
        key = key[7:]
    elif key.endswith('_ngram'):
        # Remove the _ngram suffix
        key = key[:-6]

    # A filter on `key` can be present in the query string in any of these
    # forms. Assume `fts_` and `exact_` are not used with the same value.
    key_variants = (f"exact_{key}", f"fts_{key}", key)

    # Most search operations reset the pager, so `page` is never carried over.
    params = [(k, v) for k, v in request.args.items(multi=True)
              if k != 'page']

    if value:
        # Remove (at most) one occurrence of this exact pair per variant.
        for variant in key_variants:
            if (variant, value) in params:
                params.remove((variant, value))
    else:
        # No value given: clear the field entirely. Previously the prefixed
        # variants were compared against the empty value and never removed.
        params = [(k, v) for k, v in params if k not in key_variants]

    if replace is not None:
        params.append((key, replace))

    # urlencode() already percent-encodes str values as UTF-8, so no manual
    # bytes conversion is needed.
    return url + u'?' + urlencode(params)
# CKAN core fields whose Solr type is text_general/text.
# They support full-text search (FTS) only.
# TODO: If exact match is needed, we need to alter the ckan solr schema
CKAN_FTS_FIELDS = [
    "title",
    "notes",
    "url",
    "version",
    "author",
    "author_email",
    "maintainer",
    "maintainer_email",
]

# Core string facets: exact match; optional _ngram if you enable Option B
CORE_STRING_FACETS = [
    "organization",
    "groups",
    "license_id",
    "res_format",
]
100def _solr_field_for(param_kind: str, ui_field: str, lang: str,
101 text_fields: set[str]) -> str | None:
102 """
103 Map stable UI field names to concrete Solr fields.
104 param_kind: 'fts' | 'exact' | 'min' | 'max'
105 """
106 # Text maturity fields (multilingual)
107 if ui_field in text_fields:
108 if param_kind == 'fts':
109 return f"{ui_field}_{lang}_txt"
110 if param_kind == 'exact':
111 return f"{ui_field}_{lang}_f"
112 if param_kind in ('min', 'max'):
113 return f"extras_{ui_field}"
115 # Tags are multilingual too
116 if ui_field == "tags":
117 if param_kind == 'fts':
118 return f"tags_{lang}_txt"
119 if param_kind == 'exact':
120 return f"tags_{lang}_f"
122 # CKAN core text fields: fts only
123 if ui_field in CKAN_FTS_FIELDS:
124 if param_kind == 'fts':
125 return f"{ui_field}_{lang}_txt"
126 # ignore exact_ for these
127 return None
129 # Core string facets
130 if ui_field in CORE_STRING_FACETS:
131 if param_kind == 'exact':
132 return ui_field
133 # enable FTS via _ngram (Not supported in current schema)
134 # if param_kind == 'fts':
135 # return f"{ui_field}_ngram"
137 if ui_field == 'portal_type':
138 if param_kind == 'exact':
139 return 'extras_portal_type'
141 # Everything else (selects / numbers / dates etc) use extras_*
142 if param_kind in ('min', 'max'):
143 return f"extras_{ui_field}"
144 # exact match for non-text -> extras_*
145 if param_kind == 'exact':
146 return f"extras_{ui_field}"
147 # fts on non-text: allow user-entered terms as exact values on extras_*
148 if param_kind == 'fts':
149 return f"extras_{ui_field}"
151 return None
154def _solr_field_for_facet_param(param: str, lang: str,
155 text_fields: set[str]) -> str | None:
156 """Map plain facet query params from the sidebar to concrete Solr fields."""
157 if param.startswith("extras_"):
158 return param
160 return _solr_field_for('exact', param, lang, text_fields)
def _solr_phrase(value: str) -> str:
    """Return *value* as a double-quoted Solr phrase, escaping ``\\`` and ``"``."""
    escaped = value.replace('\\', '\\\\').replace('"', '\\"')
    return f'"{escaped}"'


def _solr_range_bound(value: str) -> str:
    """Return *value* in a form that is safe as a Solr range endpoint.

    Ordinary bounds (numbers, plain tokens) are returned unchanged. Bounds
    containing whitespace, quotes, backslashes or range delimiters are quoted
    so they cannot terminate the range and inject extra query clauses.
    """
    if any(ch.isspace() or ch in '"\\[]{}' for ch in value):
        return _solr_phrase(value)
    return value


def _get_search_details() -> dict[str, Any]:
    """Parse the current request's query string into Solr search details.

    Recognised parameters:

    * ``q``, ``page``, ``sort``, ``lang``, empty values and params starting
      with ``_`` are not treated as filters.
    * ``filter-logic-<field>``: ``and`` switches the field's values to AND;
      ``date`` / ``number`` make the field's range also match documents where
      the field is undefined.
    * ``fts_<field>`` / ``exact_<field>``: full-text / exact value filters.
    * ``min_<field>`` / ``max_<field>``: range filters.
    * ``ext_*``: passed through as search extras.
    * any other param: treated as a plain facet (exact match) when it maps to
      a Solr field, otherwise returned in ``fields``.

    :returns: dict with keys ``fields`` (list of unparsed ``(param, value)``
        pairs), ``fields_grouped`` (solr field -> ``{'ui', 'fts', 'values',
        'min', 'max'}`` as applicable), ``fq`` (Solr filter query string),
        ``search_extras`` and ``filter_logics`` (ui field -> ``'AND'``).
    """
    # Function-scope import keeps this block self-contained.
    from datetime import datetime

    fields = []
    fields_grouped = {}  # key: solr_field -> { ui, values|min|max, fts:bool }
    filter_logics = {}  # key: ui_field -> 'AND' (absent means 'OR')
    include_undefined = set()  # solr fields whose range also matches "empty"
    search_extras: 'MultiDict[str, Any]' = MultiDict()

    udc = plugins.get_plugin('udc')
    # Only the text fields support multilingual full text search.
    text_fields = set(udc.text_fields or [])
    date_fields = udc.date_fields or []

    lang = request.args.get('lang') or get_current_lang()

    def add_value(solr_key: str, ui_name: str, value: str, fts: bool) -> None:
        # The entry may already exist as a range-only entry (created by a
        # min_/max_ param), so never assume 'values' is present.
        entry = fields_grouped.setdefault(solr_key, {'ui': ui_name})
        entry.setdefault('fts', fts)
        entry.setdefault('values', []).append(value)

    for param, value in request.args.items(multi=True):
        # Keep the primary query and internal params out of facet parsing.
        # `lang` only selects the search language and must not be a filter.
        if (param in (u'q', u'page', u'sort', u'lang')
                or not value
                or param.startswith(u'_')):
            continue

        log.debug("Search param: %s=%s", param, value)

        # Toggle logic
        if param.startswith('filter-logic-'):
            ui_name = param[13:]
            if value.lower() == 'and':
                filter_logics[ui_name] = 'AND'
            elif value in ('date', 'number'):
                # Include documents where the date/number is undefined;
                # 'min' resolves to the same base field as 'max'.
                solr_key = _solr_field_for('min', ui_name, lang, text_fields)
                if solr_key:
                    include_undefined.add(solr_key)
            continue

        # Search extras must be picked up before the plain-facet fallback,
        # which would otherwise turn them into extras_ext_* filters.
        if param.startswith(u'ext_'):
            search_extras.update({param: value})
            continue

        # fts_* / exact_* / min_* / max_*
        kind = next(
            (k for k in ('fts', 'exact', 'min', 'max')
             if param.startswith(f'{k}_')),
            None,
        )
        if kind:
            ui_name = param[len(kind) + 1:]
            solr_key = _solr_field_for(kind, ui_name, lang, text_fields)
            if solr_key:
                if kind in ('min', 'max'):
                    entry = fields_grouped.setdefault(solr_key, {'ui': ui_name})
                    entry[kind] = value
                else:
                    add_value(solr_key, ui_name, value, fts=(kind == 'fts'))
            continue

        # Plain facet params from CKAN sidebar links use stable outward keys.
        # Convert them into exact-match Solr filters instead of dropping them.
        solr_key = _solr_field_for_facet_param(param, lang, text_fields)
        if solr_key:
            ui_name = param[7:] if param.startswith('extras_') else param
            add_value(solr_key, ui_name, value, fts=False)
            continue

        # legacy / unknown -> returned untouched
        fields.append((param, value))

    # Build fq as a list of clauses; each is emitted with a leading space.
    clauses = []
    for solr_key, opts in fields_grouped.items():
        # values group
        vals = opts.get('values')
        if vals:
            logic_op = filter_logics.get(opts.get('ui', solr_key), 'OR')
            phrases = [_solr_phrase(v) for v in vals]
            if len(phrases) > 1:
                joined = f' {logic_op} '.join(phrases)
                clauses.append(f'{solr_key}:({joined})')
            else:
                clauses.append(f'{solr_key}:{phrases[0]}')

        # range group (dates / numbers on extras_*)
        _min = opts.get('min')
        _max = opts.get('max')
        if not (_min or _max):
            continue

        ui_name = opts.get('ui')
        if ui_name and solr_key.startswith("extras_") and ui_name in date_fields:
            try:
                # Convert date to UTC ISO format
                if _min:
                    _min = datetime.strptime(_min, '%Y-%m-%d').strftime(
                        '%Y-%m-%dT%H:%M:%SZ')
                if _max:
                    # Use 23:59:59 so the whole max day is included.
                    _max = datetime.strptime(_max, '%Y-%m-%d').replace(
                        hour=23, minute=59, second=59
                    ).strftime('%Y-%m-%dT%H:%M:%SZ')
            except ValueError:
                # If the date is not in the correct format, skip the range.
                continue
        else:
            _min = _solr_range_bound(_min) if _min else _min
            _max = _solr_range_bound(_max) if _max else _max

        range_query = f"{solr_key}:[{_min or '*'} TO {_max or '*'}]"
        if solr_key in include_undefined:
            # Also match documents that have no value for this field.
            range_query = f'({range_query} OR (*:* -{solr_key}:[* TO *]))'
        clauses.append(range_query)

    fq = u''.join(f' {clause}' for clause in clauses)

    # Collapse single-item lists so extras look like a regular dict.
    extras = {k: (v[0] if len(v) == 1 else v)
              for k, v in search_extras.lists()}

    return {
        u'fields': fields,
        u'fields_grouped': fields_grouped,
        u'fq': fq,
        u'search_extras': extras,
        'filter_logics': filter_logics
    }
def _facet_alias_map(facet_keys: list[str], lang: str) -> tuple[list[str], dict[str, str]]:
    """
    Resolve stable facet keys to the Solr fields that are actually faceted on.

    Returns (solr_facet_fields, alias_to_solr_map) where:
    - tags -> tags_<lang>_f
    - portal_type -> extras_portal_type
    - text fields listed in udc.text_fields -> <name>_<lang>_f
    - extras_* and every other key -> unchanged
    The list of Solr fields is de-duplicated and keeps first-seen order.
    """
    multilingual = set(plugins.get_plugin('udc').text_fields or [])

    def resolve(alias: str) -> str:
        if alias == "tags":
            return f"tags_{lang}_f"
        if alias == "portal_type":
            return "extras_portal_type"
        if not alias.startswith("extras_") and alias in multilingual:
            return f"{alias}_{lang}_f"
        return alias

    alias_to_solr = OrderedDict((alias, resolve(alias)) for alias in facet_keys)

    solr_fields = []
    for solr_name in alias_to_solr.values():
        if solr_name not in solr_fields:
            solr_fields.append(solr_name)
    return solr_fields, alias_to_solr
@bp.route(
    "/catalogue",
    endpoint="search",
    strict_slashes=False
)
def custom_dataset_search():
    """Render the catalogue search page.

    Checks ``site_read`` access, turns the request's query string into a
    ``package_search`` call (via ``_get_search_details``), maps the Solr facet
    fields returned by the search back to their stable facet keys and renders
    ``package/custom_search.html``.

    Aborts with 403 when the user is not authorized, and with 400 on an
    invalid search query or a non-integer ``_<facet>_limit`` parameter.
    A ``SearchError`` does not abort: the page is rendered with
    ``query_error`` set and an empty result list.
    """
    package_type = 'catalogue'
    extra_vars: dict[str, Any] = {}

    try:
        context = cast(Context, {
            u'model': model,
            u'user': current_user.name,
            u'auth_user_obj': current_user
        })
        logic.check_access(u'site_read', context)
    except logic.NotAuthorized:
        base.abort(403, _(u'Not authorized to see this page'))

    # unicode format (decoded from utf8)
    extra_vars[u'q'] = q = request.args.get(u'q', u'')

    extra_vars['query_error'] = False
    page = h.get_page_number(request.args)

    limit = config.get(u'ckan.datasets_per_page')

    # most search operations should reset the page counter:
    params_nopage = [(k, v) for k, v in request.args.items(multi=True)
                     if k != u'page']

    # remove_field is a partial function that will remove a field from the
    # search results. It is used in the search results template to generate
    # links that remove a field from the search results.
    extra_vars[u'remove_field'] = partial(remove_field, package_type)

    sort_by = request.args.get(u'sort', None)
    params_nosort = [(k, v) for k, v in params_nopage if k != u'sort']

    extra_vars[u'sort_by'] = partial(_sort_by, params_nosort, package_type)

    if not sort_by:
        sort_by_fields = []
    else:
        sort_by_fields = [field.split()[0] for field in sort_by.split(u',')]
    extra_vars[u'sort_by_fields'] = sort_by_fields

    pager_url = partial(_pager_url, params_nopage, package_type)

    details = _get_search_details()
    # Debug-level logging instead of printing to stdout on every request.
    log.debug("Search details: %r", details)
    extra_vars[u'fields'] = details[u'fields']
    extra_vars[u'fields_grouped'] = details[u'fields_grouped']
    extra_vars[u'filter_logics'] = details[u'filter_logics']
    fq = details[u'fq']
    search_extras = details[u'search_extras']

    context = cast(Context, {
        u'model': model,
        u'session': model.Session,
        u'user': current_user.name,
        u'for_view': True,
        u'auth_user_obj': current_user
    })

    # Unless changed via config options, don't show other dataset
    # types any search page. Potential alternatives are do show them
    # on the default search page (dataset) or on one other search page
    search_all_type = config.get(u'ckan.search.show_all_types')
    search_all = False

    try:
        # If the "type" is set to True or False, convert to bool
        # and we know that no type was specified, so use traditional
        # behaviour of applying this only to dataset type
        search_all = asbool(search_all_type)
        search_all_type = u'dataset'
    # Otherwise we treat as a string representing a type
    except ValueError:
        search_all = True

    if not search_all or package_type != search_all_type:
        # Only show datasets of this particular type
        fq += u' +dataset_type:{type}'.format(type=package_type)

    facets: dict[str, str] = OrderedDict()

    org_label = h.humanize_entity_type(
        u'organization',
        h.default_group_type(u'organization'),
        u'facet label') or _(u'Organizations')

    group_label = h.humanize_entity_type(
        u'group',
        h.default_group_type(u'group'),
        u'facet label') or _(u'Groups')

    default_facet_titles = {
        u'organization': org_label,
        u'groups': group_label,
        u'tags': _(u'Tags'),
        u'res_format': _(u'Formats'),
        u'license_id': _(u'Licenses'),
    }

    for facet in h.facets():
        if facet in default_facet_titles:
            facets[facet] = default_facet_titles[facet]
        else:
            facets[facet] = facet

    # Facet titles
    for plugin in plugins.PluginImplementations(plugins.IFacets):
        facets = plugin.dataset_facets(facets, package_type)

    extra_vars[u'facet_titles'] = facets

    # Templates use the stable facet keys; Solr is queried with the
    # language-specific field names they map to.
    lang = request.args.get('lang') or get_current_lang()
    facet_fields_stable = list(facets.keys())
    solr_facet_fields, alias_to_solr = _facet_alias_map(facet_fields_stable, lang)

    data_dict: dict[str, Any] = {
        u'q': q,
        u'fq': fq.strip(),
        u'facet.field': solr_facet_fields,
        # u'facet.limit': -1,
        u'rows': limit,
        u'start': (page - 1) * limit,
        u'sort': sort_by,
        u'extras': search_extras,
        u'include_private': config.get(
            u'ckan.search.default_include_private'),
    }
    try:
        query = logic.get_action(u'package_search')(context, data_dict)

        extra_vars[u'sort_by_selected'] = query[u'sort']

        extra_vars[u'page'] = Page(
            collection=query[u'results'],
            page=page,
            url=pager_url,
            item_count=query[u'count'],
            items_per_page=limit
        )

        # Re-key the facets returned under Solr field names by their stable
        # aliases so the template does not depend on the language suffix.
        raw_facets = query.get('search_facets', {})
        search_facets_stable = {}
        for alias, solr_name in alias_to_solr.items():
            if solr_name in raw_facets:
                search_facets_stable[alias] = raw_facets[solr_name]

        extra_vars[u'search_facets'] = search_facets_stable
        extra_vars[u'page'].items = query[u'results']
    except SearchQueryError as se:
        # User's search parameters are invalid, in such a way that is not
        # achievable with the web interface, so return a proper error to
        # discourage spiders which are the main cause of this.
        log.info(u'Dataset search query rejected: %r', se.args)
        base.abort(
            400,
            _(u'Invalid search query: {error_message}')
            .format(error_message=str(se))
        )
    except SearchError as se:
        # May be bad input from the user, but may also be more serious like
        # bad code causing a SOLR syntax error, or a problem connecting to
        # SOLR
        log.error(u'Dataset search error: %r', se.args)
        extra_vars[u'query_error'] = True
        extra_vars[u'search_facets'] = {}
        extra_vars[u'page'] = Page(collection=[])

    # FIXME: try to avoid using global variables
    g.search_facets_limits = {}
    default_limit: int = config.get(u'search.facets.default')
    for facet in cast(Iterable[str], extra_vars[u'search_facets'].keys()):
        try:
            # Separate name so the page size `limit` above is not rebound.
            facet_limit = int(
                request.args.get(
                    u'_%s_limit' % facet,
                    default_limit
                )
            )
        except ValueError:
            base.abort(
                400,
                _(u'Parameter u"{parameter_name}" is not '
                  u'an integer').format(parameter_name=u'_%s_limit' % facet)
            )

        g.search_facets_limits[facet] = facet_limit

    _setup_template_variables(context, {}, package_type=package_type)

    extra_vars[u'dataset_type'] = package_type

    # TODO: remove
    for key, value in extra_vars.items():
        setattr(g, key, value)

    return base.render('package/custom_search.html', extra_vars)
580# Redirect /dataset to /catalogue
@bp.route("/dataset", endpoint="redirect-search", strict_slashes=False)
def redirect_to_catalogue_search():
    """Redirect the /dataset search page to its /catalogue equivalent.

    The first "/dataset" in the request path is swapped for "/catalogue",
    the language from the WSGI environ (CKAN_LANG) is prefixed when it is
    not flagged as the default and not already leading the path, and the
    original query string is carried over unchanged.
    """
    target = request.path.replace("/dataset", "/catalogue", 1)

    environ = request.environ
    lang = environ.get("CKAN_LANG")
    lang_is_default = environ.get("CKAN_LANG_IS_DEFAULT", True)
    if lang and not lang_is_default:
        if not target.startswith(f"/{lang}/"):
            target = f"/{lang}{target}"

    raw_query = request.query_string
    if raw_query:
        target = f"{target}?{raw_query.decode('utf-8')}"

    return tk.redirect_to(target)
# Blueprint for raw graph endpoints
# (the /catalogue/<package_id>/graph[.format] routes are registered on it).
graph_blueprint = Blueprint('udc_graph', __name__)
@graph_blueprint.route('/catalogue/<package_id>/graph')
@graph_blueprint.route('/catalogue/<package_id>/graph.<format>')
def package_graph(package_id, format=None):
    """
    Return the knowledge graph for a package in raw RDF format.

    URL: /catalogue/<package_id>/graph[.format]

    The serialization is taken from the URL extension, or from the ``format``
    query parameter (default ``turtle``) when no extension is given.

    Aborts with 400 for an unknown format, 503 when GraphDB is disabled,
    403/404 when ``package_show`` access is denied or the package does not
    exist, 404 when the graph lookup raises ``ValueError`` and 500 on any
    other error while retrieving the graph.
    """
    # NOTE: `format` shadows the builtin, but it is the name of the URL
    # variable in the route above and must stay as is.
    from flask import Response, request, abort

    # Get format from URL extension or query parameter
    if not format:
        format = request.args.get('format', 'turtle')

    # Map file extensions to format names
    format_mapping = {
        'ttl': 'turtle',
        'turtle': 'turtle',
        'jsonld': 'json-ld',
        'json-ld': 'json-ld',
        'rdf': 'xml',
        'xml': 'xml',
        'n3': 'n3',
        'nt': 'nt',
    }

    format = format_mapping.get(format.lower(), format.lower())

    # Validate format
    valid_formats = {'turtle', 'json-ld', 'xml', 'n3', 'nt', 'pretty-xml'}
    if format not in valid_formats:
        # Sorted so the error message is stable between requests.
        abort(400, description=f"Invalid format '{format}'. Must be one of: {', '.join(sorted(valid_formats))}")

    # Check if GraphDB is disabled
    if plugins.get_plugin('udc').disable_graphdb:
        abort(503, description='Knowledge graph feature is disabled. GraphDB connection is not available.')

    # Check access - use package_show authorization
    try:
        tk.check_access('package_show', {'user': current_user.name if current_user else None},
                        {'id': package_id})
    except tk.NotAuthorized:
        abort(403, description='Not authorized to access this package')
    except tk.ObjectNotFound:
        abort(404, description=f'Package not found: {package_id}')

    # Get the graph
    try:
        graph_data = get_catalogue_graph(package_id, format)
    except ValueError as e:
        log.error("Error retrieving graph for package %s: %s", package_id, e)
        abort(404, description=str(e))
    except Exception:
        # log.exception keeps the traceback of the unexpected failure.
        log.exception("Unexpected error retrieving graph for package %s", package_id)
        abort(500, description='Error occurred while retrieving knowledge graph')

    # Set appropriate content type
    content_types = {
        'turtle': 'text/turtle; charset=utf-8',
        'json-ld': 'application/ld+json; charset=utf-8',
        'xml': 'application/rdf+xml; charset=utf-8',
        'pretty-xml': 'application/rdf+xml; charset=utf-8',
        'n3': 'text/n3; charset=utf-8',
        'nt': 'application/n-triples; charset=utf-8',
    }

    content_type = content_types.get(format, 'text/plain; charset=utf-8')

    # These values already carry a charset, so they are full content types.
    # Passing them as `mimetype` makes Werkzeug append a second charset
    # parameter for text/* types.
    return Response(graph_data, content_type=content_type)