Coverage for ckanext/udc/solr/solr.py: 90%
59 statements
« prev ^ index » next coverage.py v7.7.1, created at 2026-01-19 23:48 +0000
« prev ^ index » next coverage.py v7.7.1, created at 2026-01-19 23:48 +0000
1"""
2Please note that applying these Solr schema changes requires a reindex of the datasets.
3You will probably also need to restart the Solr server after the changes, and then reindex.
5Available Field Types: (Also see <solr_url>/schema/fieldtypes)
6----------------------
7- string:
8 - Exact match only, no tokenization or analysis.
9 - Use for IDs, keywords, or non-searchable fields.
10 - Used by license (id), tags, organization, title_string, url, version
11 - Used for retrieving facets.
13- text:
14 - Natural language search (e.g., descriptions, long-form text)
16- text_general:
17 - Tokenized text with lowercase filter and word splitting.
18 - Good for full-text search.
19 - General text search without stemming (e.g., titles, keywords)
21- text_ngram:
22 - Tokenized text with lowercase filter, n-gram tokenization.
23 - Good for Autocomplete, Fuzzy Matching, Partial-Word Search
24 - Used by name_ngram, title_ngram.
26- boolean:
27 - Stores `true` or `false` values.
29- pint:
30 - 32-bit signed integer (numeric).
32- plong:
33 - 64-bit signed integer (for large numbers).
35- pfloat:
36 - Single-precision floating point number.
38- pdouble:
39 - Double-precision floating point number.
41- date:
42 - ISO 8601 format (`YYYY-MM-DDThh:mm:ssZ`).
43 - Supports range queries.
45"""
47from __future__ import annotations
48import logging
50from .helpers import get_fields, get_extras_fields, delete_extras_fields, add_copy_field, delete_copy_field, add_field, delete_field, ensure_language_dynamic_fields
51from .config import get_udc_langs
53log = logging.getLogger(__name__)
def update_solr_maturity_model_fields(maturity_model: list):
    """
    Update the Solr schema to include the fields needed by the maturity model
    AND by multilingual search/facets.

    Text fields in the maturity model are multilingual JSON, so no
    'extras_<name>' text fields are created for them. Instead, the indexer
    writes to:
        <name>_<lang>_txt (search)
        <name>_<lang>_f   (facets)

    Non-text types (date/number/select) still use 'extras_<name>'.

    Steps, in order:
        1. Ensure the per-language dynamic fields exist.
        2. Build the desired static 'extras_*' definitions (non-text only).
        3. Reconcile them against the 'extras_*' fields currently in Solr.
        4. Ensure the 'tags_ngram' partial-search helper field exists.
        5. Ensure the version relationship helper fields exist as
           multiValued strings.

    Args:
        maturity_model: List of level dicts; each level may carry a "fields"
            list of field dicts. A field dict with a "ckanField" key is
            skipped; otherwise its "name" and "type" keys are read.

    Raises:
        ValueError: If a field declares a type that is not handled here.
        KeyError: If a field without "ckanField" has no "name" key.
    """
    # 1) Ensure dynamic language fields
    langs = get_udc_langs()
    ensure_language_dynamic_fields(langs)

    # 2) Build the static 'extras_*' fields for NON-text types only.
    #    This happens before any 'extras_*' change is sent to Solr, so an
    #    unknown type aborts before step 3 touches the schema.
    new_fields = _build_extras_field_defs(maturity_model)

    # 3) Reconcile against existing 'extras_*' fields in Solr
    _reconcile_extras_fields(new_fields)

    # 4) Keep tags partial-search helper
    _ensure_tags_ngram()

    # 5) Ensure version relationship helper fields exist and are multiValued strings
    _ensure_version_fields()


def _extras_field_def(key: str, solr_type: str, multi_valued: bool, doc_values: bool) -> dict:
    """Return the definition dict for one indexed + stored 'extras_*' field."""
    return {
        "name": key,
        "type": solr_type,
        "multiValued": multi_valued,
        "indexed": True,
        "stored": True,
        "docValues": doc_values,
    }


def _build_extras_field_defs(maturity_model: list) -> dict:
    """Map 'extras_<name>' to its desired Solr definition for every non-text field."""
    field_defs = {}
    for level in maturity_model:
        for field in level.get("fields", []):
            if "ckanField" in field:
                continue

            ftype = field.get("type")
            key = f"extras_{field['name']}"

            # TEXT -> multilingual JSON now; do not add 'extras_<name>'
            if ftype is None or ftype == "text":
                continue

            if ftype in ("date", "datetime", "time"):
                field_defs[key] = _extras_field_def(key, "date", False, True)
            elif ftype == "number":
                field_defs[key] = _extras_field_def(key, "pfloat", False, True)
            elif ftype in ("multiple_select", "multiple_datasets"):
                field_defs[key] = _extras_field_def(key, "string", True, False)
            elif ftype in ("single_select", "single_dataset"):
                field_defs[key] = _extras_field_def(key, "string", False, False)
            else:
                raise ValueError(f"Unknown field type: {ftype}")
    return field_defs


def _extras_field_matches(desired: dict, current: dict) -> bool:
    """
    Tell whether an existing Solr field already matches the desired definition.

    Every property is read with ``.get``: a property missing from the Solr
    response is treated as False/absent instead of raising KeyError, so such a
    field is reported as a mismatch and gets recreated with explicit
    properties. This mirrors the check used for the version helper fields.
    """
    return (
        desired["type"] == current.get("type")
        and desired["indexed"] == current.get("indexed", False)
        and desired["stored"] == current.get("stored", False)
        and desired["multiValued"] == current.get("multiValued", False)
        and desired.get("docValues", False) == current.get("docValues", False)
    )


def _reconcile_extras_fields(new_fields: dict) -> None:
    """
    Bring the 'extras_*' fields in Solr in line with ``new_fields``.

    Obsolete fields are deleted, changed fields are deleted and re-added, and
    fields missing from Solr are added. ``new_fields`` is mutated: entries that
    already exist in Solr are popped so only genuinely new ones remain.
    """
    current_fields = get_extras_fields()

    # delete/replace changed or obsolete fields
    for current_field_name, current_field in current_fields.items():
        if current_field_name not in new_fields:
            # If we used to index a text field here, drop it now (we're multilingual)
            delete_field(current_field_name)
            continue

        # Pop so the field is not added a second time below.
        desired = new_fields.pop(current_field_name)
        if not _extras_field_matches(desired, current_field):
            delete_field(current_field_name)
            add_field(
                desired["name"],
                desired["type"],
                desired["indexed"],
                desired["stored"],
                desired["multiValued"],
                desired.get("docValues", False),
            )

    # add remaining new fields
    for f in new_fields.values():
        add_field(
            f["name"],
            f["type"],
            f["indexed"],
            f["stored"],
            f["multiValued"],
            f.get("docValues", False),
        )

    if not new_fields:
        log.info("No new 'extras_*' fields to add.")
    else:
        log.info("Added %d new 'extras_*' fields.", len(new_fields))


def _ensure_tags_ngram() -> None:
    """Add the 'tags_ngram' field and its copy rule from 'tags' when absent."""
    all_fields = get_fields()
    if "tags_ngram" not in all_fields:
        add_field("tags_ngram", "text_ngram", indexed=True, stored=True, multi_valued=True)
        add_copy_field("tags", "tags_ngram")
        log.info("Added tags_ngram field.")
    else:
        log.info("tags_ngram field already exists.")


def _ensure_version_fields() -> None:
    """
    Ensure the version relationship helper fields are multiValued strings.

    These are populated by the before_dataset_index hook from JSON version
    metadata.
    """
    # Fetched fresh here, after any tags_ngram change made earlier.
    all_fields = get_fields()

    version_fields = [
        "version_dataset_url",
        "version_dataset_title_url",
        "dataset_versions_url",
        "dataset_versions_title_url",
    ]

    for fname in version_fields:
        fdef = all_fields.get(fname)
        if not fdef:
            add_field(fname, "string", indexed=True, stored=True, multi_valued=True)
            log.info("Added version field %s as multiValued string", fname)
            continue

        needs_update = (
            fdef.get("type") != "string"
            or not fdef.get("indexed", False)
            or not fdef.get("stored", False)
            or not fdef.get("multiValued", False)
        )
        if needs_update:
            delete_field(fname)
            add_field(fname, "string", indexed=True, stored=True, multi_valued=True)
            log.info("Replaced version field %s as multiValued string", fname)