Coverage for ckanext/udc/version/actions.py: 17%

78 statements  

« prev     ^ index     » next       coverage.py v7.7.1, created at 2026-01-19 23:48 +0000

1from __future__ import annotations 

2 

3import logging 

4from typing import Any, Dict 

5 

6from bs4 import BeautifulSoup # type: ignore 

7 

8import ckan.logic as logic 

9from ckan.types import Context 

10from ckan.common import _, config 

11from ckan.lib.navl.dictization_functions import DataError 

12 

13import requests 

14 

15log = logging.getLogger(__name__) 

16 

17 

def _scrape_html_title_description(url: str) -> Dict[str, str]:
    """Best-effort scrape of <title> and a description from an HTML page.

    Fetches *url* with a short timeout, parses the response with
    BeautifulSoup, and tries to extract:

    - <title>
    - <meta name="description"> or <meta property="og:description">

    The request timeout defaults to 5 seconds and may be overridden via the
    ``udc.version_meta.timeout`` config option.  Any failure (network error,
    non-HTML response, parse error) results in an empty dict — this helper
    is deliberately never allowed to raise.

    :param url: absolute URL of the page to scrape.
    :returns: dict with optional ``title`` and ``description`` keys; a key
        is omitted when the corresponding value could not be found.
    """
    headers = {
        "User-Agent": config.get(
            "udc.version_meta.user_agent",
            "CKAN-UDC-VersionMeta/1.0 (+https://ckan.org)",
        )
    }
    # Allow operators to tune the timeout; fall back to 5s on bad values
    # rather than letting a misconfiguration break the whole action.
    try:
        timeout = float(config.get("udc.version_meta.timeout", 5))
    except (TypeError, ValueError):
        timeout = 5.0

    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()
    except Exception as e:
        log.warning("udc_version_meta: error fetching %s: %s", url, e)
        return {}

    content_type = resp.headers.get("Content-Type", "")
    if "html" not in content_type.lower():
        # Don't try to parse JSON / PDF / binary payloads as HTML.
        return {}

    try:
        soup = BeautifulSoup(resp.text, "html.parser")
    except Exception as e:  # pragma: no cover - extremely unlikely
        log.warning("udc_version_meta: error parsing HTML from %s: %s", url, e)
        return {}

    title = ""
    if soup.title and soup.title.string:
        title = soup.title.string.strip()

    # Prefer <meta name="description">, falling back to og:description when
    # the former is missing OR has empty content (previously the fallback
    # only triggered when the name="description" tag was absent entirely).
    desc = ""
    for attrs in ({"name": "description"}, {"property": "og:description"}):
        meta = soup.find("meta", attrs=attrs)
        if meta:
            desc = (meta.get("content") or "").strip()
            if desc:
                break

    out: Dict[str, str] = {}
    if title:
        out["title"] = title
    if desc:
        out["description"] = desc
    return out

72 

73 

74def _extract_cudc_dataset_name(url: str) -> str | None: 

75 """Extract dataset name from a CUDC catalogue URL if possible. 

76 

77 Expected forms include: 

78 - /catalogue/<name> 

79 - /catalogue/<type>/<name> 

80 """ 

81 

82 try: 

83 from urllib.parse import urlparse 

84 except Exception: # pragma: no cover 

85 return None 

86 

87 parsed = urlparse(url) 

88 parts = [p for p in parsed.path.split("/") if p] 

89 # find 'catalogue' segment and take the following as name (or the next) 

90 for idx, part in enumerate(parts): 

91 if part == "catalogue" and idx + 1 < len(parts): 

92 # If there are two segments after 'catalogue', the last one is likely the dataset 

93 if idx + 2 < len(parts): 

94 return parts[idx + 2] 

95 return parts[idx + 1] 

96 return None 

97 

98 

def udc_version_meta(context: Context, data_dict: Dict[str, Any]) -> Dict[str, Any]:
    """Return title/description metadata for a dataset-like URL.

    For CUDC catalogue URLs (path containing ``/catalogue/``) the metadata
    is resolved through CKAN's ``package_show`` action.  For any other URL,
    or when package resolution yields nothing, a lightweight HTML scrape is
    performed unless disabled via the ``udc.version_meta.scrape`` config
    option (truthy values: ``true``, ``1``, ``yes``; default enabled).

    :param context: CKAN action context; must contain a logged-in ``user``.
    :param data_dict: must contain a non-empty ``url`` key.
    :returns: dict with optional ``title`` and ``description`` keys; either
        or both may be absent when nothing could be resolved.
    :raises logic.ValidationError: when ``url`` is missing or blank.
    :raises logic.NotAuthorized: when no user is logged in.
    """
    url = (data_dict.get("url") or "").strip()
    if not url:
        raise logic.ValidationError({"url": [_("Missing url")]})

    # Require a logged-in user: this action can trigger outbound HTTP
    # requests, so it must not be available to anonymous callers.
    user = context.get("user")
    if not user:
        raise logic.NotAuthorized(_("You must be logged in to use this action"))

    # 1) Try to resolve CUDC catalogue entries via package_show.
    result: Dict[str, Any] = {}
    if "/catalogue/" in url:
        name = _extract_cudc_dataset_name(url)
        if name:
            try:
                pkg = logic.get_action("package_show")(context, {"id": name})
                title = pkg.get("title") or pkg.get("name") or ""
                desc = pkg.get("summary") or pkg.get("notes") or ""
                if title:
                    result["title"] = title
                if desc:
                    result["description"] = desc
            except logic.NotFound:
                log.info("udc_version_meta: package %s not found for url %s", name, url)
            except Exception as e:
                # Best-effort: log and fall through to scraping below.
                log.warning(
                    "udc_version_meta: error resolving package %s for url %s: %s",
                    name,
                    url,
                    e,
                )

    # 2) If nothing was found and scraping is allowed by config, scrape.
    if not result:
        # str() guards against typed config backends returning a bool here;
        # calling .lower() directly on True/False would raise AttributeError.
        # str(True).lower() == "true", so string-config behavior is unchanged.
        allow_scrape = str(config.get("udc.version_meta.scrape", "true")).lower() in (
            "true",
            "1",
            "yes",
        )
        if allow_scrape:
            scraped = _scrape_html_title_description(url)
            if scraped:
                result.update(scraped)

    return result