traject 3.1.0.rc1 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGES.md +4 -0
- data/README.md +2 -0
- data/lib/traject/solr_json_writer.rb +20 -10
- data/lib/traject/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7bc0afc820efb8a2479d96913ae67552556978d264e20a55d0a4d4cef5ff9a2a
|
4
|
+
data.tar.gz: 773529d885d46d2ba0afd323d680724678225bd3616a720dba92809479395dce
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 395f1a3cf62cfd0cccbdcd428830feb098586c9ca1edc7adecff31483545a62dbc06631e7c166ea0ce1b229afeddcec7d994245d78871f1a6644c591d6cc432c
|
7
|
+
data.tar.gz: 647f9cfbf0a7a0876cef29b5a90a9fd6d3dd1f6a1010b332d58e132875b5c4a7f8be9bdab1afabb5ec0bc583af9ea0d8a422cc87eec6c04d93c9fe9f4b4d1e9a
|
data/CHANGES.md
CHANGED
@@ -24,6 +24,10 @@
|
|
24
24
|
|
25
25
|
* SolrJsonWriter now respects a `solr_writer.http_timeout` setting, in seconds, to be passed to HTTPClient instance. https://github.com/traject/traject/pull/219
|
26
26
|
|
27
|
+
* Only runs thread pool shutdown code (and logging) if there is a `solr_writer.thread_pool` greater than 0. Keep it out of the logs if it was a no-op anyway.
|
28
|
+
|
29
|
+
* Logs at DEBUG level every time it sends an update request to solr
|
30
|
+
|
27
31
|
* Nokogiri dependency for the NokogiriReader increased to `~> 1.9`. When using JRuby `each_record_xpath`, resulting yielded documents may have xmlns declarations on different nodes than in MRI (and previous versions of nokogiri), but we could find no way around this with nokogiri >= 1.9.0. The documents should still be semantically equivalent for namespace use. This was necessary to keep JRuby Nokogiri XML working with recent Nokogiri releases. https://github.com/traject/traject/pull/209
|
28
32
|
|
29
33
|
* LineWriter guesses better about when to auto-close, and provides an optional explicit setting in case it guesses wrong. (thanks @justinlittman) https://github.com/traject/traject/pull/211
|
data/README.md
CHANGED
@@ -175,6 +175,8 @@ TranslationMap use above is just one example of a transformation macro, that tra
|
|
175
175
|
* `append("--after each value")`
|
176
176
|
* `gsub(/regex/, "replacement")`
|
177
177
|
* `split(" ")`: take values and split them, possibly result in multiple values.
|
178
|
+
* `transform(proc)`: transform each existing value using a proc, kind of like `map`.
|
179
|
+
eg `to_field "something", extract_xml("//author"), transform( ->(author) { "#{author.last}, #{author.first}" })`
|
178
180
|
|
179
181
|
You can add on as many transformation macros as you want, they will be applied to output in order.
|
180
182
|
|
@@ -185,6 +185,9 @@ class Traject::SolrJsonWriter
|
|
185
185
|
# @param [Array<Traject::Indexer::Context>] an array of contexts
|
186
186
|
def send_batch(batch)
|
187
187
|
return if batch.empty?
|
188
|
+
|
189
|
+
logger.debug("#{self.class.name}: sending batch of #{batch.size} to Solr")
|
190
|
+
|
188
191
|
json_package = JSON.generate(batch.map { |c| c.output_hash })
|
189
192
|
|
190
193
|
begin
|
@@ -209,12 +212,15 @@ class Traject::SolrJsonWriter
|
|
209
212
|
# Send a single context to Solr, logging an error if need be
|
210
213
|
# @param [Traject::Indexer::Context] c The context whose document you want to send
|
211
214
|
def send_single(c)
|
215
|
+
logger.debug("#{self.class.name}: sending single record to Solr: #{c.output_hash}")
|
216
|
+
|
212
217
|
json_package = JSON.generate([c.output_hash])
|
213
218
|
begin
|
214
|
-
|
219
|
+
post_url = solr_update_url_with_query(@solr_update_args)
|
220
|
+
resp = @http_client.post post_url, json_package, "Content-type" => "application/json"
|
215
221
|
|
216
222
|
unless resp.status == 200
|
217
|
-
raise BadHttpResponse.new("Unexpected HTTP response status #{resp.status}", resp)
|
223
|
+
raise BadHttpResponse.new("Unexpected HTTP response status #{resp.status} from POST #{post_url}", resp)
|
218
224
|
end
|
219
225
|
|
220
226
|
# Catch Timeouts and network errors -- as well as non-200 http responses --
|
@@ -234,7 +240,7 @@ class Traject::SolrJsonWriter
|
|
234
240
|
if @max_skipped and skipped_record_count > @max_skipped
|
235
241
|
# re-raising in rescue means the last encountered error will be available as #cause
|
236
242
|
# on raised exception, a feature in ruby 2.1+.
|
237
|
-
raise MaxSkippedRecordsExceeded.new("#{self.class.name}: Exceeded maximum number of skipped records (#{@max_skipped}): aborting")
|
243
|
+
raise MaxSkippedRecordsExceeded.new("#{self.class.name}: Exceeded maximum number of skipped records (#{@max_skipped}): aborting: #{exception.message}")
|
238
244
|
end
|
239
245
|
end
|
240
246
|
end
|
@@ -255,6 +261,8 @@ class Traject::SolrJsonWriter
|
|
255
261
|
# There is no built-in way to direct a record to be deleted from an indexing config
|
256
262
|
# file at the moment, this is just a loose method on the writer.
|
257
263
|
def delete(id)
|
264
|
+
logger.debug("#{self.class.name}: Sending delete to Solr for #{id}")
|
265
|
+
|
258
266
|
json_package = {delete: id}
|
259
267
|
resp = @http_client.post solr_update_url_with_query(@solr_update_args), JSON.generate(json_package), "Content-type" => "application/json"
|
260
268
|
if resp.status != 200
|
@@ -282,14 +290,16 @@ class Traject::SolrJsonWriter
|
|
282
290
|
@thread_pool.maybe_in_thread_pool { send_batch(batch) }
|
283
291
|
end
|
284
292
|
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
293
|
+
if @thread_pool_size && @thread_pool_size > 0
|
294
|
+
# Wait for shutdown, and time it.
|
295
|
+
logger.debug "#{self.class.name}: Shutting down thread pool, waiting if needed..."
|
296
|
+
elapsed = @thread_pool.shutdown_and_wait
|
297
|
+
if elapsed > 60
|
298
|
+
logger.warn "Waited #{elapsed} seconds for all threads, you may want to increase solr_writer.thread_pool (currently #{@settings["solr_writer.thread_pool"]})"
|
299
|
+
end
|
300
|
+
logger.debug "#{self.class.name}: Thread pool shutdown complete"
|
301
|
+
logger.warn "#{self.class.name}: #{skipped_record_count} skipped records" if skipped_record_count > 0
|
290
302
|
end
|
291
|
-
logger.debug "#{self.class.name}: Thread pool shutdown complete"
|
292
|
-
logger.warn "#{self.class.name}: #{skipped_record_count} skipped records" if skipped_record_count > 0
|
293
303
|
|
294
304
|
# check again now that we've waited, there could still be some
|
295
305
|
# that didn't show up before.
|
data/lib/traject/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: traject
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.1.0.rc1
|
4
|
+
version: 3.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Rochkind
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2019-04-
|
12
|
+
date: 2019-04-22 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: concurrent-ruby
|
@@ -388,9 +388,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
388
388
|
version: '0'
|
389
389
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
390
390
|
requirements:
|
391
|
-
- - "
|
391
|
+
- - ">="
|
392
392
|
- !ruby/object:Gem::Version
|
393
|
-
version:
|
393
|
+
version: '0'
|
394
394
|
requirements: []
|
395
395
|
rubyforge_project:
|
396
396
|
rubygems_version: 2.7.6
|