Coverage for ckanext/udc/desc/actions.py: 18%

105 statements  

« prev     ^ index     » next       coverage.py v7.7.1, created at 2026-01-19 23:48 +0000

1import ckan.authz as authz 

2from ckan.types import Context 

3import ckan.logic as logic 

4from ckan.types import Context 

5from ckan.common import _ 

6 

7 

8from typing import List, Dict, cast 

9 

10import logging 

11from openai import OpenAI 

12 

13 

14from .cleaning import ( 

15 extract_display_name, 

16 convert_non_str_nan, 

17 covert_datetime, 

18) 

19from .utils import gen_mapping, get_config, get_package 

20 

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


# Instruction text appended to the generated prompt when no custom prompt is
# enabled. Plain string literals: there are no placeholders to interpolate.
default_prompt = (
    'Reset previous sessions, do not hallucinate, and create a two paragraph summary for '
    "this catalogue's fields values, ensuring uniqueness and distinct meanings for each field. Do not summarize "
    'the names of the fields available but describe their values.'
)

# Default settings for the AI summary feature.
default_config = {
    "openai_key": "",  # empty by default; no key is shipped with the code
    "openai_model": "gpt-4",
    "max_tokens": 500,
    "temperature": 0.0,
    "use_custom_prompt": False,
    "custom_prompt": default_prompt,
    "use_markdown": False,
}

39 

40 

41# Function to generate catalogue summary 

def generate_catalogue_summary(row, mapping, column_to_ignore=None):
    """Render one catalogue row as two text blocks.

    Only columns that are present in ``mapping`` and not listed in
    ``column_to_ignore`` are included.

    Args:
        row: Mapping of column name to value for a single catalogue entry.
        mapping: Mapping of column name to a dict that provides at least the
            ``display_name`` and ``short_description`` keys.
        column_to_ignore: Optional collection of column names to leave out.
            ``None`` (the default) means no column is ignored.

    Returns:
        A ``(formatted_text, field_desc_text)`` tuple. ``formatted_text`` has
        one ``"<column>: <value>"`` line per included column and
        ``field_desc_text`` has one
        ``"<display_name>: <short_description>"`` line per included column.
        Every line ends with a newline; both are empty strings when nothing
        is included.
    """
    # The default of None must not reach the membership test below.
    ignored = () if column_to_ignore is None else column_to_ignore

    value_lines = []
    description_lines = []
    for column, value in row.items():
        if column in ignored or column not in mapping:
            continue
        field = mapping[column]
        value_lines.append(f"{column}: {value}\n")
        description_lines.append(
            f"{field['display_name']}: {field['short_description']}\n"
        )
    return "".join(value_lines), "".join(description_lines)

56 

57 

58# Function to get catalogue summary from OpenAI 

def get_catalogue_summary_from_openai(row, mapping, config):
    """Ask the OpenAI chat completions API to summarize one catalogue row.

    The prompt is built from the field descriptions and field values produced
    by ``generate_catalogue_summary`` (the ``"summary"`` column is left out),
    followed by either the configured custom prompt or ``default_prompt``,
    and an optional markdown hint. It is sent as a single ``system`` message.

    Args:
        row: Mapping of column name to value for a single catalogue entry.
        mapping: Mapping of column name to field info, as expected by
            ``generate_catalogue_summary``.
        config: Settings dict. ``openai_key``, ``openai_model``,
            ``max_tokens`` and ``temperature`` are required;
            ``use_custom_prompt``, ``custom_prompt`` and ``use_markdown``
            are optional.

    Returns:
        A ``(prompt, results)`` tuple where ``results`` is the list of message
        contents, one per choice in the API response.
    """
    client = OpenAI(
        api_key=config["openai_key"],
    )

    catalogue_summary, field_desc_text = generate_catalogue_summary(
        row, mapping, column_to_ignore=["summary"]
    )

    prompt = (
        f'These are the descriptions of the fields: \n```{field_desc_text}```\n And these are the catalogue field values:\n```'
        f'{catalogue_summary}```\n'
    )

    # The custom prompt is used only when it is both enabled and non-empty.
    custom_prompt = config.get("custom_prompt")
    if config.get("use_custom_prompt") and custom_prompt:
        prompt += custom_prompt
    else:
        prompt += default_prompt

    if config.get("use_markdown"):
        prompt += " Markdown is supported and preferred. Please use links and lists where appropriate."

    res = client.chat.completions.create(
        model=config["openai_model"],
        messages=[{"role": "system", "content": prompt}],
        max_tokens=config["max_tokens"],
        temperature=config["temperature"],
    )
    # Debug-level logging instead of printing the raw response to stdout.
    logger.debug("OpenAI chat completion response: %s", res)

    return prompt, [choice.message.content for choice in res.choices]

91 

92 

def summary_generate(context: Context, package_id: str):
    """Generate an AI summary for a single package (sysadmin only).

    The package is loaded, reduced to its non-empty and relevant fields,
    cleaned up, renamed to display names using the maturity model mapping,
    and then sent to OpenAI.

    Args:
        context: CKAN action context; ``context['user']`` must be a sysadmin.
        package_id: Identifier passed to ``get_package``.

    Returns:
        A dict with the ``prompt`` that was sent and the list of ``results``
        returned by OpenAI.

    Raises:
        logic.NotAuthorized: If the current user is not a sysadmin.
        logic.ActionError: If the OpenAI request fails for any reason; the
            original exception is chained as the cause.
    """
    # Check admin
    if not authz.is_sysadmin(context.get('user')):
        raise logic.NotAuthorized(_("You are not authorized to view this page"))

    config = get_config()

    # Get a single catalogue entry
    package = get_package(context, package_id)

    properties_to_ignore = [
        "cudc_import_config_id",  # udc-import-other-portals internal field
        "related_packages",  # udc-import-other-portals internal field
        "relationships_as_object",  # udc-import-other-portals internal field
        "relationships_as_subject",  # udc-import-other-portals internal field

        "isopen",  # CKAN field, Not used
        "private",  # Private dataset
        "maintainer_email",  # CKAN field, Not used in the maturity model
        "license_id",  # We only care about the license title and url
        "num_resources",  # doesn't matter
        "num_tags",  # doesn't matter
        "state",  # CKAN field, Not used in the maturity model, used for the state of the dataset (active, deleted, etc.)
        "type",  # (catalogue) CKAN field, Not used in the maturity model, used for the type of the dataset (dataset, catalogue, etc.)
        "id",  # not interested in the package id
        "name",  # not interested in the package name
        "summary",  # Previous summary
    ]

    # Keep only the fields that are relevant and actually carry a value.
    metadata = {}
    for key in package:
        if key in properties_to_ignore:
            continue
        value = package[key]
        if value is None or value == "" or (isinstance(value, list) and not value):
            continue
        metadata[key] = value

    # Data cleaning and transformation

    # Get all resources names
    resources_name = [
        resource.get("name")
        for resource in metadata.get("resources", [])
        if resource.get("name")
    ]
    metadata["resources"] = repr(resources_name)

    # Get organization name
    metadata["organization"] = metadata.get("organization", {}).get("title")
    # owner_org is absent when the package had no value for it (empty values
    # were filtered out above), so it must not be deleted unconditionally.
    metadata.pop("owner_org", None)

    metadata["tags"] = extract_display_name(metadata.get("tags", []))

    # License: merge title and url into a single "license" entry
    metadata["license"] = metadata.pop("license_title", "")
    if metadata.get("license_url"):
        metadata["license"] += f" ({metadata['license_url']})"
        del metadata["license_url"]

    # Convert datetime to date
    metadata["metadata_created"] = covert_datetime(metadata.get("metadata_created"))
    metadata["metadata_modified"] = covert_datetime(metadata.get("metadata_modified"))

    # For these fields only whether a value is present is passed on,
    # not the value itself.
    to_suppress_if_found = ["location", "description_document", "url"]
    for key in to_suppress_if_found:
        if key in metadata:
            metadata[key] = convert_non_str_nan(
                metadata[key],
                nan_value="Not provided",
                if_found_value="Is provided",
            )

    # Maturity Model Mapping and Renamings
    mapping = gen_mapping(config["maturity_model"])

    # Additional description for the ckan fields
    mapping["Organization"] = {
        "internal_name": "organization",
        "display_name": "Organization",
        "short_description": "The dataset's owning organization in CUDC",
    }

    # CKAN uses `notes` for the description field
    mapping["Description"] = {
        "internal_name": "notes",
        "display_name": "Description",
        "short_description": mapping.get("description", {}).get("short_description", "The description of the dataset."),
    }

    # CKAN uses `url` for the source field
    mapping["Source"] = {
        "internal_name": "url",
        "display_name": "Source",
        "short_description": mapping.get("source", {}).get("short_description", "The source of the dataset."),
    }

    # License
    mapping["License"] = {
        "internal_name": "license",
        "display_name": "License",
        "short_description": mapping.get("license_id", {}).get("short_description", "License used to access the dataset."),
    }
    mapping.pop("license_id", None)

    # Resources
    mapping["Resources"] = {
        "internal_name": "resources",
        "display_name": "Resources",
        "short_description": "The resources available in the dataset. (In Python repr() format)",
    }

    # Metadata created/modified
    mapping["Metadata Created"] = {
        "internal_name": "metadata_created",
        "display_name": "Metadata Created",
        "short_description": "The date and time the metadata was created.",
    }
    mapping["Metadata Modified"] = {
        "internal_name": "metadata_modified",
        "display_name": "Metadata Modified",
        "short_description": "The date and time the metadata was last modified.",
    }

    # Rename metadata keys from internal names to display names.
    for field in mapping.values():
        if field["internal_name"] in metadata:
            metadata[field["display_name"]] = metadata.pop(field["internal_name"])

    try:
        prompt, results = get_catalogue_summary_from_openai(metadata, mapping, config)
    except Exception as e:
        # Report any failure as a CKAN action error, keeping the cause chained.
        raise logic.ActionError(
            _("\nError while generating summary using OpenAI. Exited with error: ") + str(e)
        ) from e

    return {"prompt": prompt, "results": results}

238 

def update_summary(context: Context, data: dict):
    """Save a summary on a package (sysadmin only).

    Args:
        context: CKAN action context; ``context['user']`` must be a sysadmin.
        data: Dict with non-empty ``package_id`` and ``summary`` entries.

    Returns:
        A dict echoing the ``package_id`` and the stored ``summary``.

    Raises:
        logic.NotAuthorized: If the current user is not a sysadmin.
        logic.ValidationError: If ``package_id`` or ``summary`` is missing or
            empty, or if ``get_package`` returns nothing for the id.
    """
    # Only sysadmins may call this action.
    current_user = context.get('user')
    if not authz.is_sysadmin(current_user):
        raise logic.NotAuthorized(_("You are not authorized to view this page"))

    package_id, summary = data.get("package_id"), data.get("summary")

    if not package_id:
        raise logic.ValidationError(_("package_id is required"))
    if not summary:
        raise logic.ValidationError(_("summary is required"))

    package = get_package(context, package_id)
    if not package:
        raise logic.ValidationError(_("Package not found"))

    package["summary"] = summary

    # Run the package_update auth check before calling the action itself.
    logic.check_access("package_update", context, data_dict=package)
    package_update = logic.get_action("package_update")
    package_update(context, package)

    return dict(package_id=package_id, summary=summary)

265 

266 

@logic.side_effect_free
def default_ai_summary_config(context: Context, data: dict):
    """Return the default AI summary configuration (sysadmin only).

    Args:
        context: CKAN action context; ``context['user']`` must be a sysadmin.
        data: Unused.

    Returns:
        A copy of the module-level ``default_config`` dict.

    Raises:
        logic.NotAuthorized: If the current user is not a sysadmin.
    """
    # Check admin
    if not authz.is_sysadmin(context.get('user')):
        raise logic.NotAuthorized(_("You are not authorized to view this page"))

    # Return a copy so callers cannot mutate the shared module-level defaults.
    # A shallow copy is enough: every value in default_config is immutable.
    return dict(default_config)

273 return default_config