Coverage for ckanext/udc/solr/solr.py: 90%

1"""

2Please note that applying these Solr schema changes requires a reindex of the dataset.

3You will probably also need to restart the Solr server after the changes, and then reindex.

5Available Field Types: (Also see <solr_url>/schema/fieldtypes)

6----------------------

7- string:

8 - Exact match only, no tokenization or analysis.

9 - Use for IDs, keywords, or non-searchable fields.

10 - Used by license (id), tags, organization, title_string, url, version

11 - Used for retrieving facets.

13- text:

14 - Natural language search (e.g., descriptions, long-form text)

16- text_general:

17 - Tokenized text with lowercase filter and word splitting.

18 - Good for full-text search.

19 - General text search without stemming (e.g., titles, keywords)

21- text_ngram:

22 - Tokenized text with lowercase filter, n-gram tokenization.

23 - Good for Autocomplete, Fuzzy Matching, Partial-Word Search

24 - Used by name_ngram, title_ngram.

26- boolean:

27 - Stores `true` or `false` values.

29- pint:

30 - 32-bit signed integer (numeric).

32- plong:

33 - 64-bit signed integer (for large numbers).

35- pfloat:

36 - Single-precision floating point number.

38- pdouble:

39 - Double-precision floating point number.

41- date:

42 - ISO 8601 format (`YYYY-MM-DDThh:mm:ssZ`).

43 - Supports range queries.

45"""

47from __future__ import annotations

48import logging

50from .helpers import get_fields, get_extras_fields, delete_extras_fields, add_copy_field, delete_copy_field, add_field, delete_field, ensure_language_dynamic_fields

51from .config import get_udc_langs

53log = logging.getLogger(__name__)

def update_solr_maturity_model_fields(maturity_model: list):
    """
    Update the Solr schema to include the fields needed by the maturity model
    AND multilingual search/facets.

    Text fields in the maturity model are now multilingual JSON, so we DO NOT
    create 'extras_<name>' text fields anymore. Instead, the indexer writes to:
        <name>_<lang>_txt  (search)
        <name>_<lang>_f    (facets)

    Non-text types (date/number/select) still use 'extras_<name>'.

    Steps, in order:
        1. Ensure the dynamic per-language fields exist.
        2. Build the desired static 'extras_*' definitions (non-text only).
        3. Reconcile them against the 'extras_*' fields currently in Solr:
           delete obsolete ones, replace changed ones, add missing ones.
        4. Ensure the 'tags_ngram' field and its copy field from 'tags' exist.
        5. Ensure the version relationship fields are multiValued strings.

    Args:
        maturity_model: List of level dicts. Each level may carry a "fields"
            list of field dicts; a field with a "ckanField" key is skipped,
            every other field is read through its "name" and "type" keys.

    Raises:
        ValueError: If a field declares a type that is not handled here.
    """
    # 1) Ensure dynamic language fields
    ensure_language_dynamic_fields(get_udc_langs())

    # 2) Build the static 'extras_*' fields for NON-text types only
    new_fields = {}
    for level in maturity_model:
        for field in level.get("fields", []):
            if "ckanField" in field:
                continue

            ftype = field.get("type")
            key = f"extras_{field['name']}"

            definition = _extras_field_definition(key, ftype)
            if definition is not None:
                new_fields[key] = definition

    # 3) Reconcile against existing 'extras_*' fields in Solr
    current_fields = get_extras_fields()

    for current_field_name, current_field in current_fields.items():
        # Popping here leaves only the genuinely new fields in `new_fields`
        # once the loop is done.
        desired = new_fields.pop(current_field_name, None)

        if desired is None:
            # Obsolete field. If we used to index a text field here, drop it
            # now (we're multilingual).
            delete_field(current_field_name)
            continue

        if _extras_field_differs(desired, current_field):
            # Replace the field: delete the old definition, then add the
            # desired one.
            delete_field(current_field_name)
            _add_extras_field(desired)

    # add remaining new fields
    for definition in new_fields.values():
        _add_extras_field(definition)

    if new_fields:
        log.info("Added %d new 'extras_*' fields.", len(new_fields))
    else:
        log.info("No new 'extras_*' fields to add.")

    # 4) Keep tags partial-search helper
    all_fields = get_fields()
    if "tags_ngram" not in all_fields:
        add_field("tags_ngram", "text_ngram", indexed=True, stored=True, multi_valued=True)
        add_copy_field("tags", "tags_ngram")
        log.info("Added tags_ngram field.")
    else:
        log.info("tags_ngram field already exists.")

    # Re-read the schema so the checks below see the state after step 4.
    all_fields = get_fields()

    # 5) Ensure version relationship helper fields exist and are multiValued strings
    # These are populated by the before_dataset_index hook from JSON version metadata.
    version_fields = [
        "version_dataset_url",
        "version_dataset_title_url",
        "dataset_versions_url",
        "dataset_versions_title_url",
    ]

    for fname in version_fields:
        fdef = all_fields.get(fname)
        if not fdef:
            add_field(fname, "string", indexed=True, stored=True, multi_valued=True)
            log.info("Added version field %s as multiValued string", fname)
            continue

        needs_update = (
            fdef.get("type") != "string"
            or not fdef.get("indexed", False)
            or not fdef.get("stored", False)
            or not fdef.get("multiValued", False)
        )
        if needs_update:
            delete_field(fname)
            add_field(fname, "string", indexed=True, stored=True, multi_valued=True)
            log.info("Replaced version field %s as multiValued string", fname)


def _extras_field_definition(key, ftype):
    """Return the desired Solr definition for one 'extras_*' field.

    Returns None for text fields (type missing or "text"), which are indexed
    through the multilingual dynamic fields instead of 'extras_<name>'.

    Raises:
        ValueError: If ``ftype`` is not one of the handled types.
    """
    # TEXT -> multilingual JSON now; do not add 'extras_<name>'
    if ftype is None or ftype == "text":
        return None

    if ftype in ("date", "datetime", "time"):
        return {
            "name": key,
            "type": "date",
            "multiValued": False,
            "indexed": True,
            "stored": True,
            "docValues": True,
        }

    if ftype == "number":
        return {
            "name": key,
            "type": "pfloat",
            "multiValued": False,
            "indexed": True,
            "stored": True,
            "docValues": True,
        }

    if ftype in ("multiple_select", "multiple_datasets"):
        return {
            "name": key,
            "type": "string",
            "multiValued": True,
            "stored": True,
            "indexed": True,
        }

    if ftype in ("single_select", "single_dataset"):
        return {
            "name": key,
            "type": "string",
            "multiValued": False,
            "indexed": True,
            "stored": True,
        }

    raise ValueError(f"Unknown field type: {ftype}")


def _extras_field_differs(desired, current):
    """Tell whether the existing Solr field must be replaced by ``desired``.

    Properties absent from ``current`` are read with ``.get`` (mirroring the
    version-field check in the entry point) so that a field description
    lacking e.g. "indexed" is treated as a mismatch and recreated, rather
    than raising KeyError.
    """
    return (
        desired["type"] != current.get("type")
        or desired["indexed"] != current.get("indexed", False)
        or desired["stored"] != current.get("stored", False)
        or desired["multiValued"] != current.get("multiValued", False)
        or desired.get("docValues", False) != current.get("docValues", False)
    )


def _add_extras_field(definition):
    """Create one 'extras_*' field in Solr from its definition dict."""
    add_field(
        definition["name"],
        definition["type"],
        definition["indexed"],
        definition["stored"],
        definition["multiValued"],
        definition.get("docValues", False),
    )

208 log.info("Replaced version field %s as multiValued string", fname)