Coverage for ckanext/udc/solr/solr.py: 87%

99 statements  

« prev     ^ index     » next       coverage.py v7.7.1, created at 2026-03-30 22:15 +0000

1""" 

2Please note that applying these Solr schema changes will require a reindex of the dataset. 

3You will probably also need to reboot the Solr server after the changes and then reindex. 

4 

5Available Field Types: (Also see <solr_url>/schema/fieldtypes) 

6---------------------- 

7- string: 

8 - Exact match only, no tokenization or analysis. 

9 - Use for IDs, keywords, or non-searchable fields. 

10 - Used by license (id), tags, organization, title_string, url, version 

11 - Used for retrieving facets.  

12 

13- text 

14 - Natural language search (e.g., descriptions, long-form text) 

15 

16- text_general: 

17 - Tokenized text with lowercase filter and word splitting. 

18 - Good for full-text search. 

19 - General text search without stemming (e.g., titles, keywords) 

20 

21- text_ngram: 

22 - Tokenized text with lowercase filter, n-gram tokenization. 

23 - Good for Autocomplete, Fuzzy Matching, Partial-Word Search 

24 - Used by name_ngram, title_ngram. 

25 

26- boolean: 

27 - Stores `true` or `false` values. 

28 

29- pint: 

30 - 32-bit signed integer (numeric). 

31 

32- plong: 

33 - 64-bit signed integer (for large numbers). 

34 

35- pfloat: 

36 - Single-precision floating point number. 

37 

38- pdouble: 

39 - Double-precision floating point number. 

40 

41- date: 

42 - ISO 8601 format (`YYYY-MM-DDThh:mm:ssZ`). 

43 - Supports range queries. 

44 

45""" 

46 

47from __future__ import annotations 

48import logging 

49 

50from .helpers import get_fields, get_extras_fields, delete_extras_fields, add_copy_field, delete_copy_field, add_field, delete_field, ensure_language_dynamic_fields 

51from .config import get_udc_langs 

52 

53log = logging.getLogger(__name__) 

54 

55 

56def _resolve_extras_field_name(field: dict) -> tuple[str | None, str | None]: 

57 ckan_field = field.get("ckanField") 

58 if ckan_field and ckan_field != "portal_type": 

59 return None, ckan_field 

60 if ckan_field == "portal_type": 

61 return "portal_type", ckan_field 

62 return field.get("name"), ckan_field 

63 

64 

65def _build_extras_field_definition(key: str, ftype: str, ckan_field: str | None) -> dict[str, object] | None: 

66 if ftype is None or ftype == "text": 

67 return None 

68 

69 if ftype in ("date", "datetime", "time"): 

70 return { 

71 "name": key, 

72 "type": "date", 

73 "multiValued": False, 

74 "indexed": True, 

75 "stored": True, 

76 "docValues": True, 

77 } 

78 

79 if ftype == "number": 

80 return { 

81 "name": key, 

82 "type": "pfloat", 

83 "multiValued": False, 

84 "indexed": True, 

85 "stored": True, 

86 "docValues": True, 

87 } 

88 

89 if ftype in ("multiple_select", "multiple_datasets"): 

90 return { 

91 "name": key, 

92 "type": "string", 

93 "multiValued": True, 

94 "stored": True, 

95 "indexed": True, 

96 } 

97 

98 if ftype in ("single_select", "single_dataset"): 

99 return { 

100 "name": key, 

101 "type": "string", 

102 "multiValued": ckan_field == "portal_type", 

103 "indexed": True, 

104 "stored": True, 

105 } 

106 

107 raise ValueError(f"Unknown field type: {ftype}") 

108 

109 

def _extras_field_differs(desired: dict, current: dict) -> bool:
    """Return True when a live Solr field does not match its desired definition.

    Every property of ``current`` is read with ``.get`` so that a definition
    lacking ``type``/``indexed``/``stored`` counts as a mismatch instead of
    raising ``KeyError``. This mirrors how the version helper fields are
    checked in ``update_solr_maturity_model_fields``.

    NOTE(review): the Solr Schema API can omit properties inherited from the
    field type unless defaults are requested -- confirm what
    ``get_extras_fields`` asks for.
    """
    return (
        desired["type"] != current.get("type")
        or desired["indexed"] != current.get("indexed", False)
        or desired["stored"] != current.get("stored", False)
        or desired["multiValued"] != current.get("multiValued", False)
        or desired.get("docValues", False) != current.get("docValues", False)
    )


def _add_extras_field(definition: dict) -> None:
    """Create one Solr field from a definition dict built for 'extras_*'."""
    add_field(
        definition["name"],
        definition["type"],
        definition["indexed"],
        definition["stored"],
        definition["multiValued"],
        definition.get("docValues", False),
    )


def update_solr_maturity_model_fields(maturity_model: list):
    """
    Update Solr schema to include fields needed by the maturity model AND
    multilingual search/facets.

    Text fields in the maturity model are now multilingual JSON, so we DO NOT
    create 'extras_<name>' text fields anymore. Instead, the indexer writes to:
        <name>_<lang>_txt (search)
        <name>_<lang>_f   (facets)

    Non-text types (date/number/select) still use 'extras_<name>'.

    Steps, in order:
      1. Ensure the per-language dynamic fields exist.
      2. Derive the desired 'extras_*' definitions from ``maturity_model``.
      3. Reconcile them with the 'extras_*' fields currently in Solr:
         delete those no longer desired, replace those whose definition
         changed, add the missing ones.
      4. Ensure the ``tags_ngram`` helper field (and its copy field) exists.
      5. Ensure the version relationship fields are multiValued strings.

    Args:
        maturity_model: List of level dicts; each level's ``"fields"`` entry
            (if present) is a list of field dicts.

    Raises:
        ValueError: Propagated from ``_build_extras_field_definition`` when a
            field has an unrecognised type. This happens in step 2, before
            any 'extras_*' field is deleted or added.
    """
    # 1) Ensure dynamic language fields
    langs = get_udc_langs()
    ensure_language_dynamic_fields(langs)

    # 2) Build the static 'extras_*' fields for NON-text types only
    new_fields = {}
    managed_special_fields: set[str] = set()
    special_field_status: dict[str, str] = {}
    replaced_fields: list[str] = []
    deleted_fields: list[str] = []
    unchanged_fields: list[str] = []
    for level in maturity_model:
        for field in level.get("fields", []):
            name, ckan_field = _resolve_extras_field_name(field)
            if not name:
                continue

            ftype = field.get("type")
            key = f"extras_{name}"
            if ckan_field == "portal_type":
                # Tracked separately so its schema status is always logged.
                managed_special_fields.add(key)

            field_definition = _build_extras_field_definition(key, ftype, ckan_field)
            if field_definition is None:
                continue
            new_fields[key] = field_definition

    log.info(
        "Config-derived explicit extras fields (%s): %s",
        len(new_fields),
        ", ".join(sorted(new_fields)),
    )

    # 3) Reconcile against existing 'extras_*' fields in Solr
    current_fields = get_extras_fields()

    # delete/replace changed or obsolete fields
    for current_field_name, current_field in current_fields.items():
        if current_field_name not in new_fields:
            # If we used to index a text field here, drop it now (we're multilingual)
            delete_field(current_field_name)
            deleted_fields.append(current_field_name)
            continue

        desired = new_fields[current_field_name]
        if _extras_field_differs(desired, current_field):
            # The field is redefined by deleting it and adding it again.
            delete_field(current_field_name)
            _add_extras_field(desired)
            replaced_fields.append(current_field_name)
            outcome = "replaced"
        else:
            unchanged_fields.append(current_field_name)
            outcome = "unchanged"
        if current_field_name in managed_special_fields:
            special_field_status[current_field_name] = outcome

        # Already handled: remove from "to add" so only new fields remain.
        new_fields.pop(current_field_name)

    # add remaining new fields
    pending_new_fields = dict(new_fields)
    for definition in pending_new_fields.values():
        _add_extras_field(definition)
        if definition["name"] in managed_special_fields:
            special_field_status[definition["name"]] = "added"

    if not pending_new_fields:
        log.info("No new 'extras_*' fields to add.")
    else:
        log.info(f"Added {len(pending_new_fields)} new 'extras_*' fields.")

    log.info(
        "extras schema reconcile summary: desired=%s current=%s added=%s replaced=%s deleted=%s unchanged=%s",
        # new_fields now holds only the added ones, so this sum is the desired total.
        len(new_fields) + len(replaced_fields) + len(unchanged_fields),
        len(current_fields),
        len(pending_new_fields),
        len(replaced_fields),
        len(deleted_fields),
        len(unchanged_fields),
    )

    if pending_new_fields:
        log.info("Added extras fields: %s", ", ".join(sorted(pending_new_fields)))
    if replaced_fields:
        log.info("Replaced extras fields: %s", ", ".join(sorted(replaced_fields)))
    if deleted_fields:
        log.info("Deleted obsolete extras fields: %s", ", ".join(sorted(deleted_fields)))

    for special_field in sorted(managed_special_fields):
        # No recorded status means the field produced no explicit definition.
        status = special_field_status.get(special_field, "missing-from-config")
        log.info("Special CKAN facet field %s schema status: %s", special_field, status)

    # 4) Keep tags partial-search helper
    all_fields = get_fields()
    if "tags_ngram" not in all_fields:
        add_field("tags_ngram", "text_ngram", indexed=True, stored=True, multi_valued=True)
        add_copy_field("tags", "tags_ngram")
        log.info("Added tags_ngram field.")
    else:
        log.info("tags_ngram field already exists.")

    # Re-read the schema so step 5 sees anything changed above.
    all_fields = get_fields()

    # 5) Ensure version relationship helper fields exist and are multiValued strings
    # These are populated by the before_dataset_index hook from JSON version metadata.
    version_fields = [
        "version_dataset_url",
        "version_dataset_title_url",
        "dataset_versions_url",
        "dataset_versions_title_url",
    ]

    for fname in version_fields:
        fdef = all_fields.get(fname)
        if not fdef:
            add_field(fname, "string", indexed=True, stored=True, multi_valued=True)
            log.info("Added version field %s as multiValued string", fname)
        else:
            needs_update = (
                fdef.get("type") != "string"
                or not fdef.get("indexed", False)
                or not fdef.get("stored", False)
                or not fdef.get("multiValued", False)
            )
            if needs_update:
                delete_field(fname)
                add_field(fname, "string", indexed=True, stored=True, multi_valued=True)
                log.info("Replaced version field %s as multiValued string", fname)