Coverage for ckanext/udc/desc/actions.py: 18%
105 statements
« prev ^ index » next coverage.py v7.7.1, created at 2026-01-19 23:48 +0000
« prev ^ index » next coverage.py v7.7.1, created at 2026-01-19 23:48 +0000
1import ckan.authz as authz
2from ckan.types import Context
3import ckan.logic as logic
4from ckan.types import Context
5from ckan.common import _
8from typing import List, Dict, cast
10import logging
11from openai import OpenAI
14from .cleaning import (
15 extract_display_name,
16 convert_non_str_nan,
17 covert_datetime,
18)
19from .utils import gen_mapping, get_config, get_package
21logger = logging.getLogger(__name__)
# Instruction text appended to the generated prompt when no custom prompt is
# configured. Plain string literals are used because the text contains no
# placeholders, so an f-prefix would be misleading.
default_prompt = (
    "Reset previous sessions, do not hallucinate, and create a two paragraph summary for "
    "this catalogue's fields values, ensuring uniqueness and distinct meanings for each field. Do not summarize "
    "the names of the fields available but describe their values."
)

# Default AI-summary settings. The keys match what
# get_catalogue_summary_from_openai() reads from its ``config`` argument, and
# this dict is what default_ai_summary_config() exposes.
default_config = {
    "openai_key": "",
    "openai_model": "gpt-4",
    "max_tokens": 500,
    "temperature": 0.0,
    "use_custom_prompt": False,
    "custom_prompt": default_prompt,
    "use_markdown": False,
}
41# Function to generate catalogue summary
def generate_catalogue_summary(row, mapping, column_to_ignore=None):
    """Render one catalogue entry as two newline-delimited text blocks.

    Only columns that are present in ``mapping`` and not listed in
    ``column_to_ignore`` are emitted; everything else is silently skipped.

    Args:
        row: Mapping of column name to value for a single catalogue entry.
        mapping: Mapping of column name to a dict that provides at least the
            ``display_name`` and ``short_description`` keys.
        column_to_ignore: Optional collection of column names to leave out.
            ``None`` (the default) means no column is ignored.

    Returns:
        A ``(formatted_text, field_desc_text)`` tuple. ``formatted_text`` has
        one ``"<column>: <value>"`` line per emitted column, and
        ``field_desc_text`` has the matching
        ``"<display_name>: <short_description>"`` lines, in the same order.
    """
    # The default used to be passed straight to ``in``, which raises
    # TypeError for None; treat None as "nothing to ignore".
    ignored = () if column_to_ignore is None else column_to_ignore

    value_lines = []
    description_lines = []
    for column, value in row.items():
        if column in ignored or column not in mapping:
            continue
        field = mapping[column]
        value_lines.append(f"{column}: {value}\n")
        description_lines.append(
            f"{field['display_name']}: {field['short_description']}\n"
        )
    return "".join(value_lines), "".join(description_lines)
58# Function to get catalogue summary from OpenAI
def get_catalogue_summary_from_openai(row, mapping, config):
    """Build a summary prompt for one catalogue entry and send it to OpenAI.

    Args:
        row: Mapping of column name to value for the catalogue entry. The
            ``summary`` column is always excluded from the prompt.
        mapping: Field descriptions, as accepted by
            ``generate_catalogue_summary``.
        config: Settings dict. ``openai_key``, ``openai_model``,
            ``max_tokens`` and ``temperature`` are required;
            ``use_custom_prompt``, ``custom_prompt`` and ``use_markdown`` are
            optional.

    Returns:
        A ``(prompt, results)`` tuple: the full prompt that was sent and a
        list with the message content of every returned choice.
    """
    client = OpenAI(api_key=config["openai_key"])

    catalogue_summary, field_desc_text = generate_catalogue_summary(
        row, mapping, column_to_ignore=["summary"]
    )

    prompt = (
        f"These are the descriptions of the fields: \n```{field_desc_text}```\n And these are the catalogue field values:\n```"
        f"{catalogue_summary}```\n"
    )

    # A custom prompt is only used when it is both enabled and non-empty;
    # otherwise fall back to the built-in instruction.
    custom_prompt = config.get("custom_prompt")
    if config.get("use_custom_prompt") and custom_prompt:
        prompt += custom_prompt
    else:
        prompt += default_prompt

    if config.get("use_markdown"):
        prompt += " Markdown is supported and preferred. Please use links and lists where appropriate."

    res = client.chat.completions.create(
        model=config["openai_model"],
        messages=[{"role": "system", "content": prompt}],
        max_tokens=config["max_tokens"],
        temperature=config["temperature"],
    )
    # Log at debug level instead of printing the raw response to stdout.
    logger.debug("OpenAI chat completion response: %s", res)

    return prompt, [choice.message.content for choice in res.choices]
def summary_generate(context: Context, package_id: str):
    """Generate an AI summary for a single catalogue entry (sysadmin only).

    Loads the package, drops internal and empty fields, flattens a few CKAN
    structures into prompt-friendly values, renames fields to their display
    names and sends the result to OpenAI.

    Args:
        context: CKAN action context; ``context["user"]`` must be a sysadmin.
        package_id: Identifier handed to ``get_package``.

    Returns:
        ``{"prompt": <prompt sent>, "results": [<completion text>, ...]}``.

    Raises:
        logic.NotAuthorized: If the calling user is not a sysadmin.
        logic.ActionError: If the OpenAI request fails.
    """
    # Check admin
    if not authz.is_sysadmin(context.get('user')):
        raise logic.NotAuthorized(_("You are not authorized to view this page"))

    config = get_config()

    # Get a single catalogue entry
    package = get_package(context, package_id)

    properties_to_ignore = {
        "cudc_import_config_id",  # udc-import-other-portals internal field
        "related_packages",  # udc-import-other-portals internal field
        "relationships_as_object",  # udc-import-other-portals internal field
        "relationships_as_subject",  # udc-import-other-portals internal field
        "isopen",  # CKAN field, Not used
        "private",  # Private dataset
        "maintainer_email",  # CKAN field, Not used in the maturity model
        "license_id",  # We only care about the license title and url
        "num_resources",  # doesn't matter
        "num_tags",  # doesn't matter
        "state",  # CKAN field, Not used in the maturity model, used for the state of the dataset (active, deleted, etc.)
        "type",  # (catalogue) CKAN field, Not used in the maturity model, used for the type of the dataset (dataset, catalogue, etc.)
        "id",  # not interested in the package id
        "name",  # not interested in the package name
        "summary",  # Previous summary
    }

    # Keep only the fields that are relevant and actually carry a value.
    metadata = {}
    for key in package:
        if key in properties_to_ignore:
            continue
        value = package[key]
        if value is None or value == "" or (isinstance(value, list) and not value):
            continue
        metadata[key] = value

    # Data cleaning and transformation

    # Get all resources names
    resources_name = [
        resource.get("name")
        for resource in metadata.get("resources", [])
        if resource.get("name")
    ]
    metadata["resources"] = repr(resources_name)

    # Get organization name
    metadata["organization"] = metadata.get("organization", {}).get("title")
    # ``owner_org`` is filtered out above when it is empty (package without
    # an organization), so it must be removed tolerantly.
    metadata.pop("owner_org", None)

    metadata["tags"] = extract_display_name(metadata.get("tags", []))

    # License: merge title and url into a single "title (url)" value
    metadata["license"] = metadata.pop("license_title", "")
    if metadata.get("license_url"):
        metadata["license"] += f" ({metadata['license_url']})"
        del metadata["license_url"]

    # Convert datetime to date
    metadata["metadata_created"] = covert_datetime(metadata.get("metadata_created"))
    metadata["metadata_modified"] = covert_datetime(metadata.get("metadata_modified"))

    # For these fields only the presence of a value is reported, not the value.
    to_suppress_if_found = ["location", "description_document", "url"]
    for key in to_suppress_if_found:
        if key in metadata:
            metadata[key] = convert_non_str_nan(
                metadata[key],
                nan_value="Not provided",
                if_found_value="Is provided",
            )

    # Maturity Model Mapping and Renamings
    # NOTE(review): the entries added below are keyed by display name while
    # the ``mapping.get(...)`` lookups use lower-case keys; confirm which key
    # scheme gen_mapping() actually returns.
    mapping = gen_mapping(config["maturity_model"])

    # Additional description for the ckan fields
    mapping["Organization"] = {
        "internal_name": "organization",
        "display_name": "Organization",
        "short_description": "The dataset's owning organization in CUDC",
    }

    # CKAN uses `notes` for the description field
    mapping["Description"] = {
        "internal_name": "notes",
        "display_name": "Description",
        "short_description": mapping.get("description", {}).get("short_description", "The description of the dataset."),
    }

    # CKAN uses `url` for the source field
    mapping["Source"] = {
        "internal_name": "url",
        "display_name": "Source",
        "short_description": mapping.get("source", {}).get("short_description", "The source of the dataset."),
    }

    # License
    mapping["License"] = {
        "internal_name": "license",
        "display_name": "License",
        "short_description": mapping.get("license_id", {}).get("short_description", "License used to access the dataset."),
    }
    mapping.pop("license_id", None)

    # Resources
    mapping["Resources"] = {
        "internal_name": "resources",
        "display_name": "Resources",
        "short_description": "The resources available in the dataset. (In Python repr() format)",
    }

    # Metadata created/modified
    mapping["Metadata Created"] = {
        "internal_name": "metadata_created",
        "display_name": "Metadata Created",
        "short_description": "The date and time the metadata was created.",
    }
    mapping["Metadata Modified"] = {
        "internal_name": "metadata_modified",
        "display_name": "Metadata Modified",
        "short_description": "The date and time the metadata was last modified.",
    }

    # Rename every known field from its internal name to its display name.
    for field in mapping.values():
        if field["internal_name"] in metadata:
            metadata[field["display_name"]] = metadata.pop(field["internal_name"])

    try:
        prompt, results = get_catalogue_summary_from_openai(metadata, mapping, config)
    except Exception as e:
        # Surface any failure as a CKAN action error, keeping the original
        # exception attached as the cause for debugging.
        raise logic.ActionError(
            _("\nError while generating summary using OpenAI. Exited with error: ") + str(e)
        ) from e

    return {"prompt": prompt, "results": results}
def update_summary(context: Context, data: dict):
    """Store a new summary on an existing package (sysadmin only).

    Args:
        context: CKAN action context; ``context["user"]`` must be a sysadmin.
        data: Request payload with the ``package_id`` and ``summary`` keys,
            both of which must be non-empty.

    Returns:
        ``{"package_id": <package_id>, "summary": <summary>}`` echoing the
        values that were saved.

    Raises:
        logic.NotAuthorized: If the calling user is not a sysadmin.
        logic.ValidationError: If ``package_id`` or ``summary`` is missing,
            or if ``get_package`` returns nothing for the given id.
    """
    # Check admin
    current_user = context.get('user')
    if not authz.is_sysadmin(current_user):
        raise logic.NotAuthorized(_("You are not authorized to view this page"))

    target_id = data.get("package_id")
    new_summary = data.get("summary")

    if not target_id:
        raise logic.ValidationError(_("package_id is required"))
    if not new_summary:
        raise logic.ValidationError(_("summary is required"))

    pkg_dict = get_package(context, target_id)
    if not pkg_dict:
        raise logic.ValidationError(_("Package not found"))

    pkg_dict["summary"] = new_summary

    # Authorization is checked against the already-modified package dict
    # before the update action is invoked.
    logic.check_access("package_update", context, data_dict=pkg_dict)
    save_package = logic.get_action("package_update")
    save_package(context, pkg_dict)

    return dict(package_id=target_id, summary=new_summary)
@logic.side_effect_free
def default_ai_summary_config(context: Context, data: dict):
    """Return the built-in AI summary configuration (sysadmin only).

    Args:
        context: CKAN action context; ``context["user"]`` must be a sysadmin.
        data: Unused request payload.

    Returns:
        A new dict with the same keys and values as ``default_config``.

    Raises:
        logic.NotAuthorized: If the calling user is not a sysadmin.
    """
    # Check admin
    if not authz.is_sysadmin(context.get('user')):
        raise logic.NotAuthorized(_("You are not authorized to view this page"))

    # Hand out a copy so an in-process caller cannot mutate the module-level
    # defaults; a shallow copy is enough because every value is immutable.
    return dict(default_config)