Coverage for ckanext/udc/version/actions.py: 17%

78 statements  

« prev     ^ index     » next       coverage.py v7.7.1, created at 2026-01-19 23:48 +0000

1from __future__ import annotations 

2 

3import logging 

4from typing import Any, Dict 

5 

6from bs4 import BeautifulSoup # type: ignore 

7 

8import ckan.logic as logic 

9from ckan.types import Context 

10from ckan.common import _, config 

11from ckan.lib.navl.dictization_functions import DataError 

12 

13import requests 

14 

15log = logging.getLogger(__name__) 

16 

17 

def _scrape_html_title_description(url: str) -> Dict[str, str]:
    """Best-effort scrape of <title> and a description from an HTML page.

    Fetches *url* with a short timeout, parses the response with
    BeautifulSoup, and tries to extract:

    - <title>
    - <meta name="description"> or <meta property="og:description">

    The request timeout defaults to 5 seconds and may be overridden via the
    ``udc.version_meta.timeout`` config option.  Any failure (network error,
    non-HTML response, parse error) results in an empty dict — this helper
    is deliberately never allowed to raise.

    :param url: absolute URL of the page to scrape.
    :returns: dict with optional ``title`` and ``description`` keys; a key
        is omitted when the corresponding value could not be found.
    """
    headers = {
        "User-Agent": config.get(
            "udc.version_meta.user_agent",
            "CKAN-UDC-VersionMeta/1.0 (+https://ckan.org)",
        )
    }
    # Allow operators to tune the timeout; fall back to 5s on bad values
    # rather than letting a misconfiguration break the whole action.
    try:
        timeout = float(config.get("udc.version_meta.timeout", 5))
    except (TypeError, ValueError):
        timeout = 5.0

    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()
    except Exception as e:
        log.warning("udc_version_meta: error fetching %s: %s", url, e)
        return {}

    content_type = resp.headers.get("Content-Type", "")
    if "html" not in content_type.lower():
        # Don't try to parse JSON / PDF / binary payloads as HTML.
        return {}

    try:
        soup = BeautifulSoup(resp.text, "html.parser")
    except Exception as e:  # pragma: no cover - extremely unlikely
        log.warning("udc_version_meta: error parsing HTML from %s: %s", url, e)
        return {}

    title = ""
    if soup.title and soup.title.string:
        title = soup.title.string.strip()

    # Prefer <meta name="description">, falling back to og:description when
    # the former is missing OR has empty content (previously the fallback
    # only triggered when the name="description" tag was absent entirely).
    desc = ""
    for attrs in ({"name": "description"}, {"property": "og:description"}):
        meta = soup.find("meta", attrs=attrs)
        if meta:
            desc = (meta.get("content") or "").strip()
            if desc:
                break

    out: Dict[str, str] = {}
    if title:
        out["title"] = title
    if desc:
        out["description"] = desc
    return out

72 

73 

74def _extract_cudc_dataset_name(url: str) -> str | None: 

75 """Extract dataset name from a CUDC catalogue URL if possible. 

76 

77 Expected forms include: 

78 - /catalogue/<name> 

79 - /catalogue/<type>/<name> 

80 """ 

81 

82 try: 

83 from urllib.parse import urlparse 

84 except Exception: # pragma: no cover 

85 return None 

86 

87 parsed = urlparse(url) 

88 parts = [p for p in parsed.path.split("/") if p] 

89 # find 'catalogue' segment and take the following as name (or the next) 

90 for idx, part in enumerate(parts): 

91 if part == "catalogue" and idx + 1 < len(parts): 

92 # If there are two segments after 'catalogue', the last one is likely the dataset 

93 if idx + 2 < len(parts): 

94 return parts[idx + 2] 

95 return parts[idx + 1] 

96 return None 

97 

98 

def udc_version_meta(context: Context, data_dict: Dict[str, Any]) -> Dict[str, Any]:
    """Return title/description metadata for a dataset-like URL.

    For CUDC catalogue URLs (path containing ``/catalogue/``) the metadata
    is resolved through CKAN's ``package_show`` action.  For any other URL,
    or when package resolution yields nothing, a lightweight HTML scrape is
    performed unless disabled via the ``udc.version_meta.scrape`` config
    option (truthy values: ``true``, ``1``, ``yes``; default enabled).

    :param context: CKAN action context; must contain a logged-in ``user``.
    :param data_dict: must contain a non-empty ``url`` key.
    :returns: dict with optional ``title`` and ``description`` keys; either
        or both may be absent when nothing could be resolved.
    :raises logic.ValidationError: when ``url`` is missing or blank.
    :raises logic.NotAuthorized: when no user is logged in.
    """
    url = (data_dict.get("url") or "").strip()
    if not url:
        raise logic.ValidationError({"url": [_("Missing url")]})

    # Require a logged-in user: this action can trigger outbound HTTP
    # requests, so it must not be available to anonymous callers.
    user = context.get("user")
    if not user:
        raise logic.NotAuthorized(_("You must be logged in to use this action"))

    # 1) Try to resolve CUDC catalogue entries via package_show.
    result: Dict[str, Any] = {}
    if "/catalogue/" in url:
        name = _extract_cudc_dataset_name(url)
        if name:
            try:
                pkg = logic.get_action("package_show")(context, {"id": name})
                title = pkg.get("title") or pkg.get("name") or ""
                desc = pkg.get("summary") or pkg.get("notes") or ""
                if title:
                    result["title"] = title
                if desc:
                    result["description"] = desc
            except logic.NotFound:
                log.info("udc_version_meta: package %s not found for url %s", name, url)
            except Exception as e:
                # Best-effort: log and fall through to scraping below.
                log.warning(
                    "udc_version_meta: error resolving package %s for url %s: %s",
                    name,
                    url,
                    e,
                )

    # 2) If nothing was found and scraping is allowed by config, scrape.
    if not result:
        # str() guards against typed config backends returning a bool here;
        # calling .lower() directly on True/False would raise AttributeError.
        # str(True).lower() == "true", so string-config behavior is unchanged.
        allow_scrape = str(config.get("udc.version_meta.scrape", "true")).lower() in (
            "true",
            "1",
            "yes",
        )
        if allow_scrape:
            scraped = _scrape_html_title_description(url)
            if scraped:
                result.update(scraped)

    return result