Coverage for ckanext/udc/solr/solr.py: 90%

59 statements  

« prev     ^ index     » next       coverage.py v7.7.1, created at 2026-01-19 23:48 +0000

1""" 

2Please note that applying these Solr schema changes will require a reindex of the dataset. 

3You will probably also need to restart the Solr server after the changes and then reindex. 

4 

5Available Field Types: (Also see <solr_url>/schema/fieldtypes) 

6---------------------- 

7- string: 

8 - Exact match only, no tokenization or analysis. 

9 - Use for IDs, keywords, or non-searchable fields. 

10 - Used by license (id), tags, organization, title_string, url, version 

11 - Used for retrieving facets.  

12 

13- text 

14 - Natural language search (e.g., descriptions, long-form text) 

15 

16- text_general: 

17 - Tokenized text with lowercase filter and word splitting. 

18 - Good for full-text search. 

19 - General text search without stemming (e.g., titles, keywords) 

20 

21- text_ngram: 

22 - Tokenized text with lowercase filter, n-gram tokenization. 

23 - Good for Autocomplete, Fuzzy Matching, Partial-Word Search 

24 - Used by name_ngram, title_ngram. 

25 

26- boolean: 

27 - Stores `true` or `false` values. 

28 

29- pint: 

30 - 32-bit signed integer (numeric). 

31 

32- plong: 

33 - 64-bit signed integer (for large numbers). 

34 

35- pfloat: 

36 - Single-precision floating point number. 

37 

38- pdouble: 

39 - Double-precision floating point number. 

40 

41- date: 

42 - ISO 8601 format (`YYYY-MM-DDThh:mm:ssZ`). 

43 - Supports range queries. 

44 

45""" 

46 

47from __future__ import annotations 

48import logging 

49 

50from .helpers import get_fields, get_extras_fields, delete_extras_fields, add_copy_field, delete_copy_field, add_field, delete_field, ensure_language_dynamic_fields 

51from .config import get_udc_langs 

52 

53log = logging.getLogger(__name__) 

54 

55 

# Solr settings for every non-text maturity-model field type, stored as
# (solr_type, multi_valued, doc_values). Text fields are intentionally absent:
# they are indexed through the per-language dynamic fields instead.
_EXTRAS_FIELD_SPECS = {
    "date": ("date", False, True),
    "datetime": ("date", False, True),
    "time": ("date", False, True),
    "number": ("pfloat", False, True),
    "multiple_select": ("string", True, False),
    "multiple_datasets": ("string", True, False),
    "single_select": ("string", False, False),
    "single_dataset": ("string", False, False),
}

# Helper fields that must exist as indexed, stored, multiValued strings.
_VERSION_FIELDS = (
    "version_dataset_url",
    "version_dataset_title_url",
    "dataset_versions_url",
    "dataset_versions_title_url",
)


def _build_extras_field_defs(maturity_model: list) -> dict:
    """Return the desired ``extras_<name>`` field definitions keyed by field name.

    Only non-text field types produce an entry. Raises ``ValueError`` for a
    field type that is not text and not listed in ``_EXTRAS_FIELD_SPECS``.
    """
    field_defs = {}
    for level in maturity_model:
        for field in level.get("fields", []):
            # Fields carrying "ckanField" never get an 'extras_*' field
            # (presumably they map onto a built-in CKAN field -- confirm).
            if "ckanField" in field:
                continue

            ftype = field.get("type")
            name = field["name"]

            # TEXT -> multilingual JSON now; do not add 'extras_<name>'
            if ftype is None or ftype == "text":
                continue

            # Guard the lookup so an unhashable type still ends in ValueError.
            spec = _EXTRAS_FIELD_SPECS.get(ftype) if isinstance(ftype, str) else None
            if spec is None:
                raise ValueError(f"Unknown field type: {ftype}")

            solr_type, multi_valued, doc_values = spec
            key = f"extras_{name}"
            field_defs[key] = {
                "name": key,
                "type": solr_type,
                "multiValued": multi_valued,
                "indexed": True,
                "stored": True,
                "docValues": doc_values,
            }
    return field_defs


def _extras_field_differs(desired: dict, current: dict) -> bool:
    """Return True when the existing Solr field does not match the desired one.

    Properties missing from the existing definition are read with ``.get`` so
    they count as a mismatch (and trigger a replacement) instead of raising
    ``KeyError``; this mirrors how the version helper fields are checked.
    """
    return (
        desired["type"] != current.get("type")
        or desired["indexed"] != current.get("indexed", False)
        or desired["stored"] != current.get("stored", False)
        or desired["multiValued"] != current.get("multiValued", False)
        or desired.get("docValues", False) != current.get("docValues", False)
    )


def _add_field_from_def(field_def: dict) -> None:
    """Create a Solr field from one of the desired field definitions."""
    add_field(
        field_def["name"],
        field_def["type"],
        field_def["indexed"],
        field_def["stored"],
        field_def["multiValued"],
        field_def.get("docValues", False),
    )


def update_solr_maturity_model_fields(maturity_model: list):
    """
    Update Solr schema to include fields needed by the maturity model AND
    multilingual search/facets.

    Text fields in the maturity model are now multilingual JSON, so we DO NOT
    create 'extras_<name>' text fields anymore. Instead, the indexer writes to:
        <name>_<lang>_txt   (search)
        <name>_<lang>_f     (facets)

    Non-text types (date/number/select) still use 'extras_<name>'.

    Args:
        maturity_model: list of level dicts; each level may hold a "fields"
            list of field dicts with "name" and, optionally, "type" and
            "ckanField" keys.

    Raises:
        ValueError: if a field has a type that is neither text nor one of the
            known non-text types.
        KeyError: if a field without "ckanField" has no "name".
    """
    # 1) Ensure dynamic language fields
    langs = get_udc_langs()
    ensure_language_dynamic_fields(langs)

    # 2) Build the static 'extras_*' fields for NON-text types only
    new_fields = _build_extras_field_defs(maturity_model)

    # 3) Reconcile against existing 'extras_*' fields in Solr
    current_fields = get_extras_fields()
    for current_name, current_def in current_fields.items():
        # Popping leaves only the fields that do not exist in Solr yet.
        desired = new_fields.pop(current_name, None)
        if desired is None:
            # Obsolete field, e.g. a former text field that is now multilingual.
            delete_field(current_name)
        elif _extras_field_differs(desired, current_def):
            delete_field(current_name)
            _add_field_from_def(desired)

    # add remaining new fields
    for field_def in new_fields.values():
        _add_field_from_def(field_def)

    if not new_fields:
        log.info("No new 'extras_*' fields to add.")
    else:
        log.info("Added %d new 'extras_*' fields.", len(new_fields))

    # 4) Keep tags partial-search helper
    all_fields = get_fields()
    if "tags_ngram" not in all_fields:
        add_field("tags_ngram", "text_ngram", indexed=True, stored=True, multi_valued=True)
        add_copy_field("tags", "tags_ngram")
        log.info("Added tags_ngram field.")
    else:
        log.info("tags_ngram field already exists.")

    # Re-read the schema so the checks below see the fields created above.
    all_fields = get_fields()

    # 5) Ensure version relationship helper fields exist and are multiValued strings
    #    These are populated by the before_dataset_index hook from JSON version metadata.
    for fname in _VERSION_FIELDS:
        fdef = all_fields.get(fname)
        if not fdef:
            add_field(fname, "string", indexed=True, stored=True, multi_valued=True)
            log.info("Added version field %s as multiValued string", fname)
            continue

        needs_update = (
            fdef.get("type") != "string"
            or not fdef.get("indexed", False)
            or not fdef.get("stored", False)
            or not fdef.get("multiValued", False)
        )
        if needs_update:
            delete_field(fname)
            add_field(fname, "string", indexed=True, stored=True, multi_valued=True)
            log.info("Replaced version field %s as multiValued string", fname)