Coverage for ckanext/udc/views.py: 13%

1# -*- coding: utf-8 -*-

2from __future__ import annotations

3import logging

5from datetime import datetime

6from collections import OrderedDict

7from functools import partial

8from typing import Any, Iterable, Optional, Union, cast

9from ckanext.udc.graph.logic import get_catalogue_graph

10from werkzeug.datastructures import MultiDict

12from flask import Blueprint

14import ckan.lib.base as base

15from ckan.lib.helpers import helper_functions as h

16from ckan.lib.helpers import Page

17import ckan.lib.navl.dictization_functions as dict_fns

18import ckan.logic as logic

19import ckan.model as model

20import ckan.plugins as plugins

21import ckan.authz as authz

22import ckan.plugins.toolkit as tk

23import ckan.model as model

25from ckan.lib.helpers import Page

26from ckan.common import asbool, current_user, CKANConfig, request, g, config, _

27from ckan.views.dataset import _sort_by, _pager_url, _setup_template_variables, _get_pkg_template

28from ckan.lib.search import SearchQueryError, SearchError

30from ckan.types import Context, Response

32import chalk

33import re

34from urllib.parse import urlencode

35from ckanext.udc.solr.config import get_current_lang

# Logger for this module, keyed by the module's import name.
log = logging.getLogger(name=__name__)

# Blueprint on which the catalogue search routes in this module are registered.
bp = Blueprint(name="catalogue_search", import_name=__name__)

def remove_field(package_type: Optional[str],
                 key: str,
                 value: Optional[str] = None,
                 replace: Optional[str] = None) -> str:
    """Build a search URL from the current request with one filter removed.

    The URL targets the ``<package_type>.search`` endpoint (``dataset`` when
    *package_type* is empty) and carries every current query parameter except
    ``page`` and the filter being removed.

    :param package_type: dataset type whose search endpoint is linked to.
    :param key: filter name. A leading ``extras_`` or a trailing ``_ngram``
        is stripped before matching.
    :param value: when given, only the first ``(name, value)`` pair is removed
        for each of ``exact_<key>``, ``fts_<key>`` and ``<key>``. When empty,
        every parameter with one of those three names is removed.
    :param replace: when not ``None``, ``(key, replace)`` is appended after
        the removal.
    :returns: the endpoint URL followed by ``?`` and the encoded parameters.
    """
    if not package_type:
        package_type = u'dataset'
    url = h.url_for(u'{0}.search'.format(package_type))

    # Normalise Solr-side field names back to the stable UI name.
    if key.startswith('extras_'):
        key = key[len('extras_'):]
    elif key.endswith('_ngram'):
        key = key[:-len('_ngram')]

    # Removing a filter always resets paging.
    params = [(k, v) for k, v in request.args.items(multi=True)
              if k != 'page']

    # A UI field can appear in the URL under any of these parameter names.
    # Assume `fts_` and `exact_` cannot be used together with the same value.
    names = (f"exact_{key}", f"fts_{key}", key)

    if value:
        for name in names:
            if (name, value) in params:
                params.remove((name, value))
    else:
        # No specific value: clear the whole filter, whichever form it
        # was submitted in.
        params = [(k, v) for k, v in params if k not in names]

    if replace is not None:
        params.append((key, replace))

    # urlencode() already UTF-8 encodes str values and str()-converts the rest.
    return url + u'?' + urlencode(params)

91# Solr type = text_general/text

92# FTS only

93# TODO: If exact match is needed, we need to alter the ckan solr schema

94CKAN_FTS_FIELDS = ["title", "notes", "url", "version",

95 "author", "author_email", "maintainer", "maintainer_email"]

97# Core string facets: exact match; optional _ngram if you enable Option B

98CORE_STRING_FACETS = ["organization", "license_id"]

100def _solr_field_for(param_kind: str, ui_field: str, lang: str,

101 text_fields: set[str]) -> str | None:

102 """

103 Map stable UI field names to concrete Solr fields.

104 param_kind: 'fts' | 'exact' | 'min' | 'max'

105 """

106 # Text maturity fields (multilingual)

107 if ui_field in text_fields:

108 if param_kind == 'fts':

109 return f"{ui_field}_{lang}_txt"

110 if param_kind == 'exact':

111 return f"{ui_field}_{lang}_f"

112 if param_kind in ('min', 'max'):

113 return f"extras_{ui_field}"

114

115 # Tags are multilingual too

116 if ui_field == "tags":

117 if param_kind == 'fts':

118 return f"tags_{lang}_txt"

119 if param_kind == 'exact':

120 return f"tags_{lang}_f"

121

122 # CKAN core text fields: fts only

123 if ui_field in CKAN_FTS_FIELDS:

124 if param_kind == 'fts':

125 return f"{ui_field}_{lang}_txt"

126 # ignore exact_ for these

127 return None

128

129 # Core string facets

130 if ui_field in CORE_STRING_FACETS:

131 if param_kind == 'exact':

132 return ui_field

133 # enable FTS via _ngram (Not supported in current schema)

134 # if param_kind == 'fts':

135 # return f"{ui_field}_ngram"

136

137 # Everything else (selects / numbers / dates etc) use extras_*

138 if param_kind in ('min', 'max'):

139 return f"extras_{ui_field}"

140 # exact match for non-text -> extras_*

141 if param_kind == 'exact':

142 return f"extras_{ui_field}"

143 # fts on non-text: allow user-entered terms as exact values on extras_*

144 if param_kind == 'fts':

145 return f"extras_{ui_field}"

146

147 return None

148

# A Solr field name built from request data must be a plain identifier before
# it is interpolated into the filter query.
_SOLR_FIELD_NAME_RE = re.compile(r'[\w.\-]+')

# Characters that would let a range endpoint end or restructure the
# ``[min TO max]`` expression it is interpolated into.
_UNSAFE_RANGE_CHARS_RE = re.compile(r'[\s\[\]{}"]')


def _solr_phrase(value: str) -> str:
    """Return *value* as a double-quoted Solr phrase, escaping ``\\`` and ``"``."""
    escaped = value.replace('\\', '\\\\').replace('"', '\\"')
    return f'"{escaped}"'


def _get_search_details() -> dict[str, Any]:
    """Translate the current request's query parameters into search inputs.

    Recognised parameter forms (``q``, ``page``, ``sort``, empty values and
    names starting with ``_`` are ignored):

    * ``filter-logic-<field>=and`` joins that field's values with AND
      instead of the default OR.
    * ``filter-logic-<field>=date|number`` makes the field's range filter
      also match documents where the field is missing.
    * ``fts_<field>`` / ``exact_<field>`` are value filters.
    * ``min_<field>`` / ``max_<field>`` are range filters. Endpoints of date
      fields are parsed as ``YYYY-MM-DD``; an unparsable date drops the range.
    * ``ext_*`` is collected into ``search_extras``.
    * anything else is passed through in ``fields``.

    Field names are resolved with ``_solr_field_for``; parameters it maps to
    ``None`` are skipped.

    :returns: dict with keys ``fields`` (list of ``(param, value)``),
        ``fields_grouped`` (Solr field -> ``{ui, fts, values}`` and/or
        ``{ui, min, max}``), ``fq`` (filter query string, each clause preceded
        by a space), ``search_extras`` and ``filter_logics``.
    """
    fields: list[tuple[str, str]] = []
    # fields_grouped will contain a dict of params containing
    # a list of values eg {u'tags':[u'tag1', u'tag2']}
    fields_grouped: dict[str, dict[str, Any]] = {}  # key: solr_field
    filter_logics: dict[str, str] = {}  # key: ui_field -> 'AND' (OR is implied)
    include_undefined: set[str] = set()  # solr fields whose range includes empty
    search_extras: 'MultiDict[str, Any]' = MultiDict()

    udc = plugins.get_plugin('udc')
    # Only the text fields support full text search. Tolerate the plugin not
    # having populated these lists.
    text_fields = set(udc.text_fields or [])
    date_fields = udc.date_fields or ()

    lang = request.args.get('lang') or get_current_lang()

    for param, value in request.args.items(multi=True):
        # Ignore internal parameters and empty values
        if param in (u'q', u'page', u'sort') or not value or param.startswith(u'_'):
            continue
        log.debug("Search param %s=%r", param, value)

        # Toggle logic
        if param.startswith('filter-logic-'):
            ui_name = param[len('filter-logic-'):]
            if value.lower() == 'and':
                filter_logics[ui_name] = 'AND'
            elif value in ('date', 'number'):
                # include undefined date/number: same base field as min_/max_
                solr_key = _solr_field_for('min', ui_name, lang, text_fields)
                if solr_key:
                    include_undefined.add(solr_key)
            continue

        # fts_* / exact_* (value filters) and min_* / max_* (range filters)
        kind, sep, ui_name = param.partition('_')
        if sep and kind in ('fts', 'exact', 'min', 'max'):
            solr_key = _solr_field_for(kind, ui_name, lang, text_fields)
            # Skip unsupported combinations, and names that are not plain
            # identifiers: they come from the request and end up in fq.
            if not solr_key or not _SOLR_FIELD_NAME_RE.fullmatch(solr_key):
                continue
            if kind in ('min', 'max'):
                group = fields_grouped.setdefault(solr_key, {'ui': ui_name})
                group[kind] = value
            else:
                group = fields_grouped.setdefault(
                    solr_key,
                    {'ui': ui_name, 'fts': kind == 'fts', 'values': []})
                # The group may have been created by a min_/max_ parameter
                # that maps to the same Solr field; fill in what is missing
                # rather than failing on the absent 'values' key.
                group.setdefault('fts', kind == 'fts')
                group.setdefault('values', []).append(value)
            continue

        # legacy / unknown -> pass-through as extras
        if param.startswith(u'ext_'):
            search_extras.update({param: value})
        else:
            fields.append((param, value))

    # Build fq
    clauses: list[str] = []
    for solr_key, opts in fields_grouped.items():
        # values group (takes precedence over a range on the same field)
        if 'values' in opts:
            phrases = [_solr_phrase(v) for v in opts['values']]
            if len(phrases) > 1:
                logic_op = filter_logics.get(opts.get('ui', solr_key), 'OR')
                joined = f' {logic_op} '.join(phrases)
                clauses.append(f'{solr_key}:({joined})')
            else:
                clauses.append(f'{solr_key}:{phrases[0]}')
            continue

        # range group (dates / numbers on extras_*)
        range_min = opts.get('min')
        range_max = opts.get('max')
        ui_name = opts.get('ui')
        if ui_name and solr_key.startswith("extras_") and ui_name in date_fields:
            # Date normalization if the underlying UI field was a date field
            try:
                if range_min:
                    range_min = datetime.strptime(
                        range_min, '%Y-%m-%d').strftime('%Y-%m-%dT%H:%M:%SZ')
                if range_max:
                    # Use 23:59:59 on the max date to include the whole day
                    end_of_day = datetime.strptime(range_max, '%Y-%m-%d').replace(
                        hour=23, minute=59, second=59)
                    range_max = end_of_day.strftime('%Y-%m-%dT%H:%M:%SZ')
            except ValueError:
                # If the date is not in the correct format, skip it
                continue
        elif any(end and _UNSAFE_RANGE_CHARS_RE.search(end)
                 for end in (range_min, range_max)):
            # Endpoint could break out of the range expression: skip it,
            # like an unparsable date.
            continue

        if not (range_min or range_max):
            continue
        clause = f"{solr_key}:[{range_min or '*'} TO {range_max or '*'}]"
        if solr_key in include_undefined:
            # Also match documents that have no value for this field
            clause = f'({clause} OR (*:* -{solr_key}:[* TO *]))'
        clauses.append(clause)

    # Every clause is preceded by a space so callers can keep appending.
    fq = u''.join(f' {clause}' for clause in clauses)

    # Single-valued extras are unwrapped from their list.
    extras = {k: v[0] if len(v) == 1 else v
              for k, v in search_extras.lists()}

    return {
        u'fields': fields,
        u'fields_grouped': fields_grouped,
        u'fq': fq,
        u'search_extras': extras,
        'filter_logics': filter_logics
    }

302

303

def _facet_alias_map(facet_keys: list[str], lang: str) -> tuple[list[str], dict[str, str]]:
    """
    Resolve stable facet keys to the Solr fields to facet on.

    Returns ``(solr_facet_fields, alias_to_solr_map)`` where:
    - ``tags`` resolves to ``tags_<lang>_f``
    - keys starting with ``extras_`` resolve to themselves
    - keys listed in the udc plugin's ``text_fields`` resolve to
      ``<name>_<lang>_f``
    - every other key resolves to itself

    ``solr_facet_fields`` holds the resolved names without duplicates, in
    first-seen order.
    """
    multilingual = set(plugins.get_plugin('udc').text_fields or [])

    def resolve(alias: str) -> str:
        if alias == "tags":
            return f"tags_{lang}_f"
        if alias.startswith("extras_") or alias not in multilingual:
            return alias
        return f"{alias}_{lang}_f"

    alias_to_solr = OrderedDict((alias, resolve(alias)) for alias in facet_keys)

    solr_fields = []
    for solr_name in alias_to_solr.values():
        if solr_name not in solr_fields:
            solr_fields.append(solr_name)
    return solr_fields, alias_to_solr

327

328

@bp.route(
    "/catalogue",
    endpoint="search",
    strict_slashes=False
)
def custom_dataset_search():
    """Render the catalogue search page.

    Reads the free-text query, paging, sorting and filter parameters from
    the current request, runs the ``package_search`` action and renders
    ``package/custom_search.html``. Facets are requested under the Solr
    field names produced by ``_facet_alias_map`` and mapped back to their
    stable keys before being handed to the template.

    Aborts with 403 when the ``site_read`` check fails, and with 400 when the
    search query is rejected or a ``_<facet>_limit`` parameter is not an
    integer. A ``SearchError`` is logged and rendered as an empty result page
    with ``query_error`` set.
    """
    package_type = 'catalogue'
    extra_vars: dict[str, Any] = {}

    try:
        context = cast(Context, {
            u'model': model,
            u'user': current_user.name,
            u'auth_user_obj': current_user
        })
        logic.check_access(u'site_read', context)
    except logic.NotAuthorized:
        base.abort(403, _(u'Not authorized to see this page'))

    # unicode format (decoded from utf8)
    extra_vars[u'q'] = q = request.args.get(u'q', u'')

    extra_vars['query_error'] = False
    page = h.get_page_number(request.args)

    limit = config.get(u'ckan.datasets_per_page')

    # most search operations should reset the page counter:
    params_nopage = [(k, v) for k, v in request.args.items(multi=True)
                     if k != u'page']

    # remove_field is a partial function that will remove a field from the
    # search results. It is used in the search results template to generate
    # links that remove a field from the search results.
    extra_vars[u'remove_field'] = partial(remove_field, package_type)

    sort_by = request.args.get(u'sort', None)
    params_nosort = [(k, v) for k, v in params_nopage if k != u'sort']

    extra_vars[u'sort_by'] = partial(_sort_by, params_nosort, package_type)

    if not sort_by:
        sort_by_fields = []
    else:
        # Skip empty components (e.g. "sort=,") instead of failing on [0].
        sort_by_fields = [field.split()[0] for field in sort_by.split(u',')
                          if field.strip()]
    extra_vars[u'sort_by_fields'] = sort_by_fields

    pager_url = partial(_pager_url, params_nopage, package_type)

    details = _get_search_details()
    log.debug("Search details: %r", details)
    extra_vars[u'fields'] = details[u'fields']
    extra_vars[u'fields_grouped'] = details[u'fields_grouped']
    extra_vars[u'filter_logics'] = details[u'filter_logics']
    fq = details[u'fq']
    search_extras = details[u'search_extras']

    context = cast(Context, {
        u'model': model,
        u'session': model.Session,
        u'user': current_user.name,
        u'for_view': True,
        u'auth_user_obj': current_user
    })

    # Unless changed via config options, don't show other dataset
    # types any search page. Potential alternatives are do show them
    # on the default search page (dataset) or on one other search page
    search_all_type = config.get(u'ckan.search.show_all_types')
    search_all = False

    try:
        # If the "type" is set to True or False, convert to bool
        # and we know that no type was specified, so use traditional
        # behaviour of applying this only to dataset type
        search_all = asbool(search_all_type)
        search_all_type = u'dataset'
    # Otherwise we treat as a string representing a type
    except ValueError:
        search_all = True

    if not search_all or package_type != search_all_type:
        # Only show datasets of this particular type
        fq += u' +dataset_type:{type}'.format(type=package_type)

    facets: dict[str, str] = OrderedDict()

    org_label = h.humanize_entity_type(
        u'organization',
        h.default_group_type(u'organization'),
        u'facet label') or _(u'Organizations')

    group_label = h.humanize_entity_type(
        u'group',
        h.default_group_type(u'group'),
        u'facet label') or _(u'Groups')

    default_facet_titles = {
        u'organization': org_label,
        u'groups': group_label,
        u'tags': _(u'Tags'),
        u'res_format': _(u'Formats'),
        u'license_id': _(u'Licenses'),
    }

    # Facets without a default title are labelled with their own key.
    for facet in h.facets():
        facets[facet] = default_facet_titles.get(facet, facet)

    # Facet titles
    for plugin in plugins.PluginImplementations(plugins.IFacets):
        facets = plugin.dataset_facets(facets, package_type)

    extra_vars[u'facet_titles'] = facets

    lang = request.args.get('lang') or get_current_lang()
    solr_facet_fields, alias_to_solr = _facet_alias_map(list(facets.keys()), lang)

    data_dict: dict[str, Any] = {
        u'q': q,
        u'fq': fq.strip(),
        u'facet.field': solr_facet_fields,
        u'rows': limit,
        u'start': (page - 1) * limit,
        u'sort': sort_by,
        u'extras': search_extras,
        u'include_private': config.get(
            u'ckan.search.default_include_private'),
    }
    try:
        query = logic.get_action(u'package_search')(context, data_dict)

        extra_vars[u'sort_by_selected'] = query[u'sort']

        extra_vars[u'page'] = Page(
            collection=query[u'results'],
            page=page,
            url=pager_url,
            item_count=query[u'count'],
            items_per_page=limit
        )

        # Re-key the facet results from Solr field names back to the stable
        # keys the template knows about.
        raw_facets = query.get('search_facets', {})
        search_facets_stable = {}
        for alias, solr_name in alias_to_solr.items():
            if solr_name in raw_facets:
                search_facets_stable[alias] = raw_facets[solr_name]

        extra_vars[u'search_facets'] = search_facets_stable
        extra_vars[u'page'].items = query[u'results']
    except SearchQueryError as se:
        # User's search parameters are invalid, in such a way that is not
        # achievable with the web interface, so return a proper error to
        # discourage spiders which are the main cause of this.
        log.info(u'Dataset search query rejected: %r', se.args)
        base.abort(
            400,
            _(u'Invalid search query: {error_message}')
            .format(error_message=str(se))
        )
    except SearchError as se:
        # May be bad input from the user, but may also be more serious like
        # bad code causing a SOLR syntax error, or a problem connecting to
        # SOLR
        log.error(u'Dataset search error: %r', se.args)
        extra_vars[u'query_error'] = True
        extra_vars[u'search_facets'] = {}
        extra_vars[u'page'] = Page(collection=[])

    # FIXME: try to avoid using global variables
    g.search_facets_limits = {}
    default_limit: int = config.get(u'search.facets.default')
    for facet in cast(Iterable[str], extra_vars[u'search_facets'].keys()):
        limit_param = u'_%s_limit' % facet
        try:
            facet_limit = int(request.args.get(limit_param, default_limit))
        except ValueError:
            base.abort(
                400,
                _(u'Parameter u"{parameter_name}" is not '
                  u'an integer').format(parameter_name=limit_param)
            )

        g.search_facets_limits[facet] = facet_limit

    _setup_template_variables(context, {}, package_type=package_type)

    extra_vars[u'dataset_type'] = package_type

    # TODO: remove
    for key, value in extra_vars.items():
        setattr(g, key, value)

    return base.render('package/custom_search.html', extra_vars)

551

552

# Redirect /dataset to /catalogue
@bp.route(
    "/dataset",
    endpoint="redirect-search",
    strict_slashes=False
)
def redirect_to_catalogue_search():
    """Redirect a ``/dataset`` request to the matching ``/catalogue`` URL.

    The first ``/dataset`` in the request path is replaced by ``/catalogue``.
    When the WSGI environ carries a ``CKAN_LANG`` that is not flagged as the
    default (``CKAN_LANG_IS_DEFAULT``), the path is prefixed with
    ``/<lang>`` unless it already starts with it. The original query string
    is kept.
    """
    target = request.path.replace("/dataset", "/catalogue", 1)

    environ = request.environ
    locale = environ.get("CKAN_LANG")
    if locale and not environ.get("CKAN_LANG_IS_DEFAULT", True):
        locale_prefix = f"/{locale}"
        if not target.startswith(f"{locale_prefix}/"):
            target = locale_prefix + target

    raw_query = request.query_string
    if raw_query:
        target = f"{target}?{raw_query.decode('utf-8')}"

    return tk.redirect_to(target)

569

570

571

# Blueprint for raw graph endpoints
graph_blueprint = Blueprint('udc_graph', __name__)


@graph_blueprint.route('/catalogue/<package_id>/graph')
@graph_blueprint.route('/catalogue/<package_id>/graph.<format>')
def package_graph(package_id, format=None):
    """
    Return the knowledge graph for a package in raw RDF format.

    URL: /catalogue/<package_id>/graph[.format]

    The serialisation comes from the URL extension, else from the ``format``
    query parameter, else defaults to ``turtle``. Common file extensions
    (``ttl``, ``jsonld``, ``rdf``) are mapped to their format names.

    Aborts with:
    - 400 for an unknown format
    - 503 when the udc plugin reports GraphDB as disabled
    - 403 / 404 when the ``package_show`` access check fails
    - 404 when ``get_catalogue_graph`` raises ``ValueError``
    - 500 for any other error while retrieving the graph
    """
    # NOTE: `format` shadows the builtin, but it is the name of the URL
    # variable in the route above and must stay as is.
    from flask import Response, request, abort

    # Get format from URL extension or query parameter
    if not format:
        format = request.args.get('format', 'turtle')

    # Map file extensions to format names
    format_mapping = {
        'ttl': 'turtle',
        'turtle': 'turtle',
        'jsonld': 'json-ld',
        'json-ld': 'json-ld',
        'rdf': 'xml',
        'xml': 'xml',
        'n3': 'n3',
        'nt': 'nt',
    }

    requested = format.lower()
    format = format_mapping.get(requested, requested)

    # Validate format
    valid_formats = {'turtle', 'json-ld', 'xml', 'n3', 'nt', 'pretty-xml'}
    if format not in valid_formats:
        # Sorted so the message is the same on every request.
        abort(400, description=f"Invalid format '{format}'. Must be one of: {', '.join(sorted(valid_formats))}")

    # Check if GraphDB is disabled
    if plugins.get_plugin('udc').disable_graphdb:
        abort(503, description='Knowledge graph feature is disabled. GraphDB connection is not available.')

    # Check access - use package_show authorization
    try:
        tk.check_access('package_show', {'user': current_user.name if current_user else None},
                        {'id': package_id})
    except tk.NotAuthorized:
        abort(403, description='Not authorized to access this package')
    except tk.ObjectNotFound:
        abort(404, description=f'Package not found: {package_id}')

    # Get the graph
    try:
        graph_data = get_catalogue_graph(package_id, format)
    except ValueError as e:
        log.error("Error retrieving graph for package %s: %s", package_id, e)
        abort(404, description=str(e))
    except Exception as e:
        # log.exception keeps the traceback of the unexpected failure.
        log.exception("Unexpected error retrieving graph for package %s: %s", package_id, e)
        abort(500, description='Error occurred while retrieving knowledge graph')

    # Set appropriate content type
    content_types = {
        'turtle': 'text/turtle; charset=utf-8',
        'json-ld': 'application/ld+json; charset=utf-8',
        'xml': 'application/rdf+xml; charset=utf-8',
        'pretty-xml': 'application/rdf+xml; charset=utf-8',
        'n3': 'text/n3; charset=utf-8',
        'nt': 'application/n-triples; charset=utf-8',
    }

    content_type = content_types.get(format, 'text/plain; charset=utf-8')

    # These strings already carry their charset, so pass them as the complete
    # Content-Type. `mimetype=` would append a second charset for text/* types.
    return Response(graph_data, content_type=content_type)

643

644 return Response(graph_data, mimetype=content_type)