Coverage for ckanext/udc/version/actions.py: 17%
78 statements
« prev ^ index » next coverage.py v7.7.1, created at 2026-01-19 23:48 +0000
1from __future__ import annotations
3import logging
4from typing import Any, Dict
6from bs4 import BeautifulSoup # type: ignore
8import ckan.logic as logic
9from ckan.types import Context
10from ckan.common import _, config
11from ckan.lib.navl.dictization_functions import DataError
13import requests
15log = logging.getLogger(__name__)
def _scrape_html_title_description(url: str) -> Dict[str, str]:
    """Best-effort extraction of a page title and description from *url*.

    Fetches the URL with a short timeout and, when the response looks like
    HTML, parses it with BeautifulSoup to pull out:

    - ``<title>``
    - ``<meta name="description">`` or ``<meta property="og:description">``

    Deliberately defensive: any fetch or parse failure is logged and an
    empty dict is returned.
    """
    request_headers = {
        "User-Agent": config.get(
            "udc.version_meta.user_agent",
            "CKAN-UDC-VersionMeta/1.0 (+https://ckan.org)",
        )
    }

    try:
        response = requests.get(url, headers=request_headers, timeout=5)
        response.raise_for_status()
    except Exception as exc:
        log.warning("udc_version_meta: error fetching %s: %s", url, exc)
        return {}

    # Only attempt to parse responses that declare an HTML content type.
    if "html" not in response.headers.get("Content-Type", "").lower():
        return {}

    try:
        document = BeautifulSoup(response.text, "html.parser")
    except Exception as exc:  # pragma: no cover - extremely unlikely
        log.warning("udc_version_meta: error parsing HTML from %s: %s", url, exc)
        return {}

    meta: Dict[str, str] = {}

    if document.title and document.title.string:
        title_text = document.title.string.strip()
        if title_text:
            meta["title"] = title_text

    description_tag = document.find(
        "meta", attrs={"name": "description"}
    ) or document.find("meta", attrs={"property": "og:description"})
    if description_tag:
        description_text = (description_tag.get("content") or "").strip()
        if description_text:
            meta["description"] = description_text

    return meta
74def _extract_cudc_dataset_name(url: str) -> str | None:
75 """Extract dataset name from a CUDC catalogue URL if possible.
77 Expected forms include:
78 - /catalogue/<name>
79 - /catalogue/<type>/<name>
80 """
82 try:
83 from urllib.parse import urlparse
84 except Exception: # pragma: no cover
85 return None
87 parsed = urlparse(url)
88 parts = [p for p in parsed.path.split("/") if p]
89 # find 'catalogue' segment and take the following as name (or the next)
90 for idx, part in enumerate(parts):
91 if part == "catalogue" and idx + 1 < len(parts):
92 # If there are two segments after 'catalogue', the last one is likely the dataset
93 if idx + 2 < len(parts):
94 return parts[idx + 2]
95 return parts[idx + 1]
96 return None
def udc_version_meta(context: Context, data_dict: Dict[str, Any]) -> Dict[str, Any]:
    """Return title/description metadata for a dataset-like URL.

    CUDC catalogue URLs are resolved through CKAN's ``package_show``;
    any other URL falls back to a lightweight HTML scrape, unless that is
    disabled via the ``udc.version_meta.scrape`` config option.

    :raises logic.ValidationError: when ``url`` is missing or blank.
    :raises logic.NotAuthorized: when no user is present in *context*.
    """
    url = (data_dict.get("url") or "").strip()
    if not url:
        raise logic.ValidationError({"url": [_("Missing url")]})

    # This action is restricted to logged-in users.
    if not context.get("user"):
        raise logic.NotAuthorized(_("You must be logged in to use this action"))

    meta: Dict[str, Any] = {}

    # 1) CUDC catalogue entries: resolve via package_show when a dataset
    #    name can be extracted from the URL path.
    if "/catalogue/" in url:
        dataset_name = _extract_cudc_dataset_name(url)
        if dataset_name:
            try:
                pkg = logic.get_action("package_show")(context, {"id": dataset_name})
                title = pkg.get("title") or pkg.get("name") or ""
                desc = pkg.get("summary") or pkg.get("notes") or ""
                if title:
                    meta["title"] = title
                if desc:
                    meta["description"] = desc
            except logic.NotFound:
                log.info("udc_version_meta: package %s not found for url %s", dataset_name, url)
            except Exception as exc:
                log.warning(
                    "udc_version_meta: error resolving package %s for url %s: %s",
                    dataset_name,
                    url,
                    exc,
                )

    # 2) Nothing resolved yet: scrape the URL if the config allows it.
    if not meta:
        scrape_enabled = config.get("udc.version_meta.scrape", "true").lower() in (
            "true",
            "1",
            "yes",
        )
        if scrape_enabled:
            scraped = _scrape_html_title_description(url)
            if scraped:
                meta.update(scraped)

    return meta