Coverage for ckanext/udc/solr/solr.py: 87%
99 statements
« prev ^ index » next coverage.py v7.7.1, created at 2026-03-30 22:15 +0000
« prev ^ index » next coverage.py v7.7.1, created at 2026-03-30 22:15 +0000
1"""
2Please note that applying these Solr schema changes requires a reindex of the datasets.
3You will probably also need to restart the Solr server after the changes, and then reindex.
5Available Field Types: (Also see <solr_url>/schema/fieldtypes)
6----------------------
7- string:
8 - Exact match only, no tokenization or analysis.
9 - Use for IDs, keywords, or non-searchable fields.
10 - Used by license (id), tags, organization, title_string, url, version
11 - Used for retrieving facets.
13- text:
14 - Natural language search (e.g., descriptions, long-form text)
16- text_general:
17 - Tokenized text with lowercase filter and word splitting.
18 - Good for full-text search.
19 - General text search without stemming (e.g., titles, keywords)
21- text_ngram:
22 - Tokenized text with lowercase filter, n-gram tokenization.
23 - Good for Autocomplete, Fuzzy Matching, Partial-Word Search
24 - Used by name_ngram, title_ngram.
26- boolean:
27 - Stores `true` or `false` values.
29- pint:
30 - 32-bit signed integer (numeric).
32- plong:
33 - 64-bit signed integer (for large numbers).
35- pfloat:
36 - Single-precision floating point number.
38- pdouble:
39 - Double-precision floating point number.
41- date:
42 - ISO 8601 format (`YYYY-MM-DDThh:mm:ssZ`).
43 - Supports range queries.
45"""
47from __future__ import annotations
48import logging
50from .helpers import get_fields, get_extras_fields, delete_extras_fields, add_copy_field, delete_copy_field, add_field, delete_field, ensure_language_dynamic_fields
51from .config import get_udc_langs
53log = logging.getLogger(__name__)
56def _resolve_extras_field_name(field: dict) -> tuple[str | None, str | None]:
57 ckan_field = field.get("ckanField")
58 if ckan_field and ckan_field != "portal_type":
59 return None, ckan_field
60 if ckan_field == "portal_type":
61 return "portal_type", ckan_field
62 return field.get("name"), ckan_field
65def _build_extras_field_definition(key: str, ftype: str, ckan_field: str | None) -> dict[str, object] | None:
66 if ftype is None or ftype == "text":
67 return None
69 if ftype in ("date", "datetime", "time"):
70 return {
71 "name": key,
72 "type": "date",
73 "multiValued": False,
74 "indexed": True,
75 "stored": True,
76 "docValues": True,
77 }
79 if ftype == "number":
80 return {
81 "name": key,
82 "type": "pfloat",
83 "multiValued": False,
84 "indexed": True,
85 "stored": True,
86 "docValues": True,
87 }
89 if ftype in ("multiple_select", "multiple_datasets"):
90 return {
91 "name": key,
92 "type": "string",
93 "multiValued": True,
94 "stored": True,
95 "indexed": True,
96 }
98 if ftype in ("single_select", "single_dataset"):
99 return {
100 "name": key,
101 "type": "string",
102 "multiValued": ckan_field == "portal_type",
103 "indexed": True,
104 "stored": True,
105 }
107 raise ValueError(f"Unknown field type: {ftype}")
def _extras_field_needs_replacement(desired: dict, current: dict) -> bool:
    """Return True when an existing Solr field differs from its desired definition."""
    return (
        desired["type"] != current["type"]
        or desired["indexed"] != current["indexed"]
        or desired["stored"] != current["stored"]
        or desired["multiValued"] != current.get("multiValued", False)
        or desired.get("docValues", False) != current.get("docValues", False)
    )


def _add_extras_field_from_definition(definition: dict) -> None:
    """Create one Solr field from a definition dict (docValues defaults to False)."""
    add_field(
        definition["name"],
        definition["type"],
        definition["indexed"],
        definition["stored"],
        definition["multiValued"],
        definition.get("docValues", False),
    )


def update_solr_maturity_model_fields(maturity_model: list):
    """
    Update Solr schema to include fields needed by the maturity model AND
    multilingual search/facets.

    Text fields in the maturity model are now multilingual JSON, so we DO NOT
    create 'extras_<name>' text fields anymore. Instead, the indexer writes to:
        <name>_<lang>_txt (search)
        <name>_<lang>_f (facets)

    Non-text types (date/number/select) still use 'extras_<name>'.

    Args:
        maturity_model: List of level dicts. Each level may carry a "fields"
            list of field dicts, from which "ckanField", "name" and "type"
            are read.

    Raises:
        ValueError: Propagated from _build_extras_field_definition when a
            field has an unrecognised "type".
    """
    # 1) Ensure dynamic language fields
    langs = get_udc_langs()
    ensure_language_dynamic_fields(langs)

    # 2) Build the static 'extras_*' fields for NON-text types only
    new_fields = {}
    managed_special_fields: set[str] = set()
    special_field_status: dict[str, str] = {}
    replaced_fields: list[str] = []
    deleted_fields: list[str] = []
    unchanged_fields: list[str] = []
    for level in maturity_model:
        for field in level.get("fields", []):
            name, ckan_field = _resolve_extras_field_name(field)
            if not name:
                continue

            ftype = field.get("type")
            key = f"extras_{name}"
            # Tracked before the definition is built, so a special field with
            # no explicit definition is still reported in the status log below.
            if ckan_field == "portal_type":
                managed_special_fields.add(key)

            field_definition = _build_extras_field_definition(key, ftype, ckan_field)
            if field_definition is None:
                continue
            new_fields[key] = field_definition

    log.info(
        "Config-derived explicit extras fields (%s): %s",
        len(new_fields),
        ", ".join(sorted(new_fields)),
    )

    # 3) Reconcile against existing 'extras_*' fields in Solr
    current_fields = get_extras_fields()

    # delete/replace changed or obsolete fields
    for current_field_name, current_field in current_fields.items():
        if current_field_name not in new_fields:
            # If we used to index a text field here, drop it now (we're multilingual)
            delete_field(current_field_name)
            deleted_fields.append(current_field_name)
            continue

        desired = new_fields[current_field_name]
        if _extras_field_needs_replacement(desired, current_field):
            # Replace = delete the existing field, then add the desired one.
            delete_field(current_field_name)
            _add_extras_field_from_definition(desired)
            replaced_fields.append(current_field_name)
            if current_field_name in managed_special_fields:
                special_field_status[current_field_name] = "replaced"
        else:
            unchanged_fields.append(current_field_name)
            if current_field_name in managed_special_fields:
                special_field_status[current_field_name] = "unchanged"
        # remove from "to add"
        new_fields.pop(current_field_name)

    # add remaining new fields
    pending_new_fields = dict(new_fields)
    for definition in pending_new_fields.values():
        _add_extras_field_from_definition(definition)
        if definition["name"] in managed_special_fields:
            special_field_status[definition["name"]] = "added"

    if not pending_new_fields:
        log.info("No new 'extras_*' fields to add.")
    else:
        log.info("Added %s new 'extras_*' fields.", len(pending_new_fields))

    log.info(
        "extras schema reconcile summary: desired=%s current=%s added=%s replaced=%s deleted=%s unchanged=%s",
        # new_fields now holds only the not-yet-existing entries, so the
        # original desired count is the sum of these three groups.
        len(new_fields) + len(replaced_fields) + len(unchanged_fields),
        len(current_fields),
        len(pending_new_fields),
        len(replaced_fields),
        len(deleted_fields),
        len(unchanged_fields),
    )

    if pending_new_fields:
        log.info("Added extras fields: %s", ", ".join(sorted(pending_new_fields)))
    if replaced_fields:
        log.info("Replaced extras fields: %s", ", ".join(sorted(replaced_fields)))
    if deleted_fields:
        log.info("Deleted obsolete extras fields: %s", ", ".join(sorted(deleted_fields)))

    for special_field in sorted(managed_special_fields):
        status = special_field_status.get(special_field, "missing-from-config")
        log.info("Special CKAN facet field %s schema status: %s", special_field, status)

    # 4) Keep tags partial-search helper
    all_fields = get_fields()
    if "tags_ngram" not in all_fields:
        add_field("tags_ngram", "text_ngram", indexed=True, stored=True, multi_valued=True)
        add_copy_field("tags", "tags_ngram")
        log.info("Added tags_ngram field.")
    else:
        log.info("tags_ngram field already exists.")

    # Re-read the schema after step 4 before checking the version fields.
    all_fields = get_fields()

    # 5) Ensure version relationship helper fields exist and are multiValued strings
    # These are populated by the before_dataset_index hook from JSON version metadata.
    version_fields = [
        "version_dataset_url",
        "version_dataset_title_url",
        "dataset_versions_url",
        "dataset_versions_title_url",
    ]

    for fname in version_fields:
        fdef = all_fields.get(fname)
        if not fdef:
            add_field(fname, "string", indexed=True, stored=True, multi_valued=True)
            log.info("Added version field %s as multiValued string", fname)
        else:
            needs_update = (
                fdef.get("type") != "string"
                or not fdef.get("indexed", False)
                or not fdef.get("stored", False)
                or not fdef.get("multiValued", False)
            )
            if needs_update:
                delete_field(fname)
                add_field(fname, "string", indexed=True, stored=True, multi_valued=True)
                log.info("Replaced version field %s as multiValued string", fname)