sdr-replication 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e99a0814e4383ec6287dec7df41825d786e65919
4
+ data.tar.gz: 4890c2758dd820f22ce8aa9015621e1b33dec9ae
5
+ SHA512:
6
+ metadata.gz: deb3400e53fcbdf16cf8263ffe1deed70be83889633b4fecfa0de58e41a076e7f502de61d22f079b09ff6b81ddb29586bb6e898782f22e45010ecfd46d93828d
7
+ data.tar.gz: a0533d23addcc264e7aeca97f985bdad1d70e8081230e09ab5337a03314b058374541b1587f47a5f89dd2f558115677190d49b320b892a77e7031bb15ac9c3ff
data/lib/libdir.rb ADDED
@@ -0,0 +1,3 @@
1
# Resolve the absolute path of the directory that holds this file and make sure
# it is on the load path. It is prepended only when not already present, so
# loading this file more than once does not add duplicate entries.
libdir = File.expand_path(File.dirname(__FILE__))
$LOAD_PATH.include?(libdir) || $LOAD_PATH.unshift(libdir)
3
+
@@ -0,0 +1,110 @@
1
+ require 'rubygems'
2
+ require 'rest-client'
3
+
4
+ module Replication
5
+
6
+ # A wrapper class based on {RestClient} used to interface with the Archive Catalog service.
7
+ # <br>
8
+ # <br>
9
+ # The default RestClient behavior is:
10
+ # * for results code between 200 and 207 a RestClient::Response will be returned
11
+ # * for results code 301, 302 or 307 the redirection will be followed if the request is a get or a head
12
+ # * for result code 303 the redirection will be followed and the request transformed into a get
13
+ # * for other cases a RestClient::Exception holding the Response will be raised
14
+ #
15
+ # But we are using a technique that forces RestClient to always provide the response
16
+ # <br>
17
+ # <br>
18
+ # RestClient::Response has these instance methods (some inherited from AbstractResponse):
19
+ # * args
20
+ # * body
21
+ # * code (e.g. 204)
22
+ # * description (e.g. "204 No Content | 0 bytes")
23
+ # * headers
24
+ # * net_http_res
25
+ #
26
+ # @see https://github.com/rest-client/rest-client
27
+ # @see http://rubydoc.info/gems/rest-client/1.6.7/frames
28
class ArchiveCatalog

  # Class-level defaults; both can be overridden through the accessors below.
  @root_uri = 'http://localhost:3000'
  @timeout = 120

  # @see https://www.google.com/search?q="class+<<+self"+"attr_accessor"
  class << self

    # @return [String] The base or home URL of the Archive Catalog web service
    attr_accessor :root_uri

    # @return [Integer] seconds to wait for a response or to open a connection. Value nil disables the timeout.
    attr_accessor :timeout

    # Build the base RestClient resource to be used for requests.
    # A new resource is created on every call, so later changes to root_uri or timeout take effect.
    # @return [RestClient::Resource] resource rooted at root_uri; timeout is used for both open and read timeouts
    def root_resource
      options = {:open_timeout => @timeout, :timeout => @timeout}
      RestClient::Resource.new(@root_uri, options)
    end

    # Get the item record from the specified table for the specified primary key.
    # @param [String] table name of the database table
    # @param [String] id primary key for the item in the database table
    # @return [Hash] the row (in key,value hash) from the specified table for the specified identifier.
    #   Response body contains the item data in JSON format, which is converted to a hash.
    # @raise [RuntimeError] carrying the response description when the response code is not 200
    # @see http://tools.ietf.org/html/rfc2616#page-53
    def get_item(table,id)
      headers = {:accept => 'application/json'}
      # The block makes RestClient hand back the response instead of raising RestClient::Exception
      reply = root_resource["#{table}/#{id}.json"].get(headers) { |res, _request, _result| res }
      raise reply.description unless reply.code.to_s == '200'
      JSON.parse(reply.body)
    end

    # Retrieve an existing database record or add a new one using the data provided.
    # @param [String] table name of the database table
    # @param [Hash] hash the item data to be added to the database table
    # @return [Hash] result containing the item data as if a GET were performed.
    #   The HTTP response code for success is 201 (Created).
    # @raise [RuntimeError] carrying the response description when the response code is not 201
    # @see http://en.wikipedia.org/wiki/POST_(HTTP)
    # @see http://tools.ietf.org/html/rfc2616#page-54
    def find_or_create_item(table,hash)
      payload = hash.to_json
      headers = {:content_type => :json, :accept => :json}
      # The block makes RestClient hand back the response instead of raising RestClient::Exception
      reply = root_resource["#{table}.json"].post(payload, headers) { |res, _request, _result| res }
      raise reply.description unless reply.code.to_s == '201'
      JSON.parse(reply.body)
    end

    # Update the database columns for the specified item using the hash data.
    # @param [String] table name of the database table
    # @param [String] id primary key for the item in the database table
    # @param [Hash] hash the item data to be updated in the database table
    # @return (Boolean) true if the HTTP response code is 204, per specification for PATCH or PUT request types.
    #   Response body is empty, per same specification.
    # @raise [RuntimeError] carrying the response description when the response code is not 204
    # @see https://tools.ietf.org/html/rfc5789
    # @see http://stackoverflow.com/questions/797834/should-a-restful-put-operation-return-something/827045#827045
    def update_item(table,id,hash)
      payload = hash.to_json
      headers = {:content_type => :json}
      # The block makes RestClient hand back the response instead of raising RestClient::Exception
      reply = root_resource["#{table}/#{id}.json"].patch(payload, headers) { |res, _request, _result| res }
      raise reply.description unless reply.code.to_s == '204'
      true
    end

  end

end
109
+
110
+ end
@@ -0,0 +1,337 @@
1
+ require File.join(File.dirname(__FILE__),'../libdir')
2
+ require 'sdr_replication'
3
+
4
+ module Replication
5
+
6
+ # A BagIt bag contains a structured copy of a digital object for storage, transfer, or replication
7
+ # @see https://tools.ietf.org/html/draft-kunze-bagit-10
8
+ # This class can be used to create, parse, or validate a bag instance
9
+ #
10
+ # @note Copyright (c) 2014 by The Board of Trustees of the Leland Stanford Junior University.
11
+ # All rights reserved. See {file:LICENSE.rdoc} for details.
12
class BagitBag

  # @param [Pathname,String] pathname The location of the bag home directory
  # @return [BagitBag] Initialize a new bag, create home and payload folders, write bagit.txt file
  def self.create_bag(pathname)
    bag = BagitBag.new
    bag.bag_pathname = pathname
    bag.payload_pathname.mkpath
    bag.write_bagit_txt
    bag
  end

  # @param [Pathname,String] pathname The location of the bag home directory
  # @return [BagitBag] A bag object for an existing bag home directory. Nothing is created or written.
  # @raise [RuntimeError] if the bag home directory or its bagit.txt file does not exist
  def self.open_bag(pathname)
    bag = BagitBag.new
    bag.bag_pathname = pathname
    raise "No bag found at #{bag.bag_pathname}" unless bag.bag_pathname.exist?
    bagit_txt = bag.bag_pathname.join("bagit.txt")
    raise "No bagit.txt file found at #{bagit_txt}" unless bagit_txt.exist?
    bag
  end

  # @return [Pathname] The location of the bag home directory
  attr_reader :bag_pathname

  # @param [Pathname,String] pathname The location of the bag home directory
  # @return [Void] Set the location of the bag home directory
  def bag_pathname=(pathname)
    @bag_pathname = Pathname(pathname)
  end

  # @return [Pathname] The location of the bag data directory
  def payload_pathname
    bag_pathname.join('data')
  end

  # @return [Pathname] Generate the bagit.txt tag file
  def write_bagit_txt
    bagit_txt = bag_pathname.join("bagit.txt")
    bagit_txt.open('w') do |f|
      f.puts "Tag-File-Character-Encoding: UTF-8"
      f.puts "BagIt-Version: 0.97"
    end
    bagit_txt
  end

  # @return [Hash<String,String>] A hash containing the properties documented in the bagit.txt tagfile
  def read_bagit_txt
    read_tag_file("bagit.txt")
  end

  # @return [Array<Symbol>] The list of checksum types to be used when generating fixity data
  def bag_checksum_types
    @bag_checksum_types ||= Fixity.default_checksum_types
  end

  # @param [Object] types The list of checksum types to be used when generating fixity data
  # @return [Void] Set the list of checksum types to be used when generating fixity data
  def bag_checksum_types=(*types)
    # Assignment syntax always delivers exactly one argument, so the splat here and
    # the splat below cancel out: Fixity receives the assigned value unchanged.
    @bag_checksum_types = Fixity.validate_checksum_types(*types)
  end

  # @param [Symbol] link_mode Specifies whether to :copy, :link, or :symlink the files to the payload directory
  # @param [Pathname] source_dir The source location of the directory whose contents are to be ingested
  # @return [Pathname] Generate file_fixity_hash and send it to #add_payload_files
  def add_payload_dir(link_mode, source_dir)
    file_fixity_hash = Fixity.generate_checksums(source_dir, nil, bag_checksum_types)
    add_payload_files(link_mode, source_dir, file_fixity_hash)
    payload_pathname
  end

  # @param [Symbol] link_mode Specifies whether to :copy, :link, or :symlink the files to the payload directory
  # @param [Pathname] source_basepath The source location of the directory whose contents are to be ingested
  # @param [Hash<String,FileFixity>] file_fixity_hash The list of files (with fixity data) to be added to the payload
  # @return [Pathname] Copy or link the files specified in the file_fixity_hash to the payload directory,
  #   then update the payload manifest files
  def add_payload_files(link_mode, source_basepath, file_fixity_hash)
    file_fixity_hash.keys.each do |file_id|
      source_pathname = source_basepath.join(file_id)
      target_pathname = payload_pathname.join(file_id)
      copy_file(link_mode, source_pathname, target_pathname)
    end
    write_manifest_checksums('manifest', file_fixity_hash)
    payload_pathname
  end

  # @param [Symbol] link_mode Specifies whether to :copy, :link, or :symlink the files to the payload directory
  # @param [Pathname] source_pathname The source location of the file to be ingested
  # @param [Pathname] target_pathname The full path (including filename) at which to place the file
  # @return [Pathname] link or copy the specified file from source location to the target location
  # @raise [RuntimeError] if link_mode is not one of :copy, :link, :symlink (nil is treated as :copy)
  def copy_file(link_mode, source_pathname, target_pathname)
    # Make sure the destination folder exists before copying or linking into it
    target_pathname.parent.mkpath
    case link_mode
    when :copy, nil
      FileUtils.copy(source_pathname.to_s, target_pathname.to_s) # automatically dereferences symlinks
    when :link
      FileUtils.link(source_pathname.to_s, target_pathname.to_s) #, :force => true (false is default)
    when :symlink
      FileUtils.symlink(source_pathname.to_s, target_pathname.to_s) #, :force => true (false is default)
    else
      raise "Invalid link_mode: #{link_mode}, expected one of [:copy,:link,:symlink]"
    end
    target_pathname
  end

  # @param [String] tarfile_id The name to give the tar archive inside the payload directory
  # @param [Pathname,String] source_fullpath The location of the directory whose content will be tarred
  # @param [Pathname,String] source_basepath The location of the directory to change to before doing the tar create
  # @return [Tarfile] Create a tar archive of a directory into the payload directory,
  #   generating checksums in parallel processes and recording those checksums in the payload manifests
  def add_payload_tarfile(tarfile_id, source_fullpath, source_basepath)
    tarfile = Tarfile.new
    tarfile.source_basepath = Pathname(source_basepath)
    tarfile.source_fullpath = Pathname(source_fullpath)
    tarfile.tarfile_basepath = payload_pathname
    tarfile.tarfile_fullpath = payload_pathname.join("#{tarfile_id}")
    tarfile.create_tarfile
    file_fixity_hash = Fixity.generate_checksums(tarfile.tarfile_basepath, [tarfile.tarfile_fullpath], bag_checksum_types)
    write_manifest_checksums('manifest', file_fixity_hash)
    tarfile
  end

  # @return [Pathname] Generate the bag-info.txt tag file to record the payload size
  def write_bag_info_txt
    payload_size = bag_payload_size
    bag_info_txt = bag_pathname.join("bag-info.txt")
    bag_info_txt.open('w') do |f|
      f.puts "External-Identifier: #{bag_pathname.basename}"
      f.puts "Payload-Oxum: #{payload_size[:bytes]}.#{payload_size[:files]}"
      f.puts "Bag-Size: #{bag_size_human(payload_size[:bytes])}"
    end
    bag_info_txt
  end

  # @return [Hash<Symbol,Integer>] A hash containing the payload size in bytes, and the number of files,
  #   derived from the payload directory contents
  def bag_payload_size
    totals = {bytes: 0, files: 0}
    # Walk the payload tree recursively, counting regular files only
    payload_pathname.find.select { |f| f.file? }.each do |file|
      totals[:bytes] += file.size
      totals[:files] += 1
    end
    totals
  end

  # @param [Integer] bytes The total number of bytes in the payload
  # @return [String] Human-readable rendition of the total payload size
  def bag_size_human(bytes)
    count = 0
    size = bytes
    # Scale down by 1024 per step; the unit is capped at TB (count 4)
    while size >= 1024 && count < 4
      size /= 1024.0
      count += 1
    end
    return sprintf("%d B", size) if count == 0
    sprintf("%.2f %s", size, %w[B KB MB GB TB][count])
  end

  # @return [Hash<String,String>] A hash containing the properties documented in the bag-info.txt tagfile
  def read_bag_info_txt
    read_tag_file("bag-info.txt")
  end

  # @return [Hash<Symbol,Integer>] A hash containing the payload size in bytes, and the number of files,
  #   derived from the Payload-Oxum property
  # @raise [RuntimeError] if bag-info.txt has no Payload-Oxum property
  def info_payload_size
    oxum = read_bag_info_txt['Payload-Oxum']
    raise "Payload-Oxum not found in #{bag_pathname.join('bag-info.txt')}" unless oxum
    # Payload-Oxum is written by #write_bag_info_txt as "<bytes>.<files>"
    bytes, files = oxum.split('.')
    {:bytes => bytes.to_i, :files => files.to_i}
  end

  # @return [Boolean] Compare the actual measured payload size against the value recorded in bag-info.txt
  # @raise [RuntimeError] if the recorded and measured sizes differ
  def verify_payload_size
    info_size = info_payload_size
    bag_size = bag_payload_size
    if info_size != bag_size
      raise "Failed payload size verification! Expected: #{info_size}, Found: #{bag_size}"
    end
    true
  end

  # @return [Hash<String,FileFixity>] create hash containing ids and checksums for all files in the bag's root directory
  def generate_tagfile_checksums
    # The tag manifests cannot list their own checksums, so they are left out
    tagfiles = bag_pathname.children.reject { |file| file.basename.to_s.start_with?('tagmanifest') }
    Fixity.generate_checksums(bag_pathname, tagfiles, bag_checksum_types)
  end

  # @return [Hash<String,FileFixity>] create hash containing ids and checksums for all files in the bag's payload
  def generate_payload_checksums
    Fixity.generate_checksums(payload_pathname, nil, bag_checksum_types)
  end

  # @param [String] manifest_type The type of manifest file ('manifest' or 'tagmanifest') to be updated
  # @param [Hash<String,FileFixity>] file_fixity_hash A hash containing file ids and fixity data
  # @param [String] open_mode The file open mode (default is 'a')
  # @return [Hash<Symbol,Pathname>] Update each of the manifests with data from the file_fixity_hash
  def write_manifest_checksums(manifest_type, file_fixity_hash, open_mode='a')
    manifests = {}
    bag_checksum_types.each do |checksum_type|
      manifest_pathname = bag_pathname.join("#{manifest_type}-#{checksum_type}.txt")
      # Block form closes the file even if a fixity object raises while being written
      manifest_pathname.open(open_mode) do |manifest_file|
        file_fixity_hash.values.each do |fixity|
          checksum = fixity.get_checksum(checksum_type)
          manifest_file.puts("#{checksum} #{fixity.file_id}") if checksum
        end
      end
      manifests[checksum_type] = manifest_pathname
    end
    manifests
  end

  # @param [String] manifest_type The type of manifest file ('manifest' or 'tagmanifest') to be read
  # @return [Hash<String,FileFixity>] A hash containing file ids and fixity data derived from the manifest files.
  #   As a side effect, every checksum type for which a manifest file was found is added to #bag_checksum_types.
  def read_manifest_files(manifest_type)
    file_fixity_hash = {}
    checksum_type_list = []
    Fixity.valid_checksum_ids.each do |checksum_type|
      manifest_pathname = bag_pathname.join("#{manifest_type}-#{checksum_type}.txt")
      next unless manifest_pathname.file?
      checksum_type_list << checksum_type
      manifest_pathname.readlines.each do |raw_line|
        # Non-destructive strip: the bang variants return nil when nothing changes
        # (e.g. a final line without a newline), which made chained calls fail.
        line = raw_line.strip
        # Blank lines carry no entry; skipping them avoids a record with a nil file id
        next if line.empty?
        # Separator is whitespace, optionally with '*' before the filename
        checksum, file_id = line.split(/[\s*]+/, 2)
        file_fixity = file_fixity_hash[file_id] || FileFixity.new(file_id: file_id)
        file_fixity.set_checksum(checksum_type, checksum)
        file_fixity_hash[file_id] = file_fixity
      end
    end
    self.bag_checksum_types = self.bag_checksum_types | checksum_type_list
    file_fixity_hash
  end

  # @return [Boolean] Compare fixity data from the tag manifest files against the values measured by digesting the files
  def verify_tagfile_manifests
    manifest_type = 'tagmanifest'
    manifest_fixity_hash = read_manifest_files(manifest_type)
    bag_fixity_hash = generate_tagfile_checksums
    verify_manifests(manifest_type, manifest_fixity_hash, bag_fixity_hash)
  end

  # @return [Boolean] Compare fixity data from the payload manifest files against the values measured by digesting the files
  def verify_payload_manifests
    manifest_type = 'manifest'
    manifest_fixity_hash = read_manifest_files(manifest_type)
    bag_fixity_hash = generate_payload_checksums
    verify_manifests(manifest_type, manifest_fixity_hash, bag_fixity_hash)
  end

  # @param [String] manifest_type The type of manifest file ('manifest' or 'tagmanifest') to be read
  # @param [Hash<String,FileFixity>] manifest_fixity_hash A hash containing file ids and fixity data derived from the manifest files
  # @param [Hash<String,FileFixity>] bag_fixity_hash A hash containing file ids and fixity data derived from the actual files
  # @return [Boolean] Compare fixity data from the manifest files against the values measured by digesting the files,
  #   returning true if equal
  # @raise [RuntimeError] listing the differences if the two sets of fixity data are not equal
  def verify_manifests(manifest_type, manifest_fixity_hash, bag_fixity_hash)
    diff = manifest_diff(manifest_fixity_hash, bag_fixity_hash)
    if diff.size > 0
      raise "Failed #{manifest_type} verification! Differences: \n#{diff.inspect}"
    end
    true
  end

  # @param [Hash<String,FileFixity>] manifest_fixity_hash A hash containing file ids and fixity data derived from the manifest files
  # @param [Hash<String,FileFixity>] bag_fixity_hash A hash containing file ids and fixity data derived from the actual files
  # @return [Hash] A report of the differences between the fixity data from the manifest files
  #   against the values measured by digesting the files
  def manifest_diff(manifest_fixity_hash, bag_fixity_hash)
    diff = {}
    (manifest_fixity_hash.keys | bag_fixity_hash.keys).each do |file_id|
      # A file known to only one side is compared against an empty fixity record
      manifest_fixity = manifest_fixity_hash[file_id] || FileFixity.new(file_id: file_id)
      bag_fixity = bag_fixity_hash[file_id] || FileFixity.new(file_id: file_id)
      if manifest_fixity != bag_fixity
        diff[file_id] = manifest_fixity.diff(bag_fixity, 'manifest', 'bag')
      end
    end
    diff
  end

  # @return [Boolean] Validate the bag containing the digital object
  # @raise [RuntimeError] from the first verification step that fails
  def verify_bag
    verify_bag_structure
    verify_tagfile_manifests
    verify_payload_size
    verify_payload_manifests
    true
  end

  # @return [Boolean] Test the existence of expected files, return true if files exist, raise exception if not
  def verify_bag_structure
    required_files = ['data','bagit.txt','bag-info.txt','manifest-sha256.txt','tagmanifest-sha256.txt']
    required_files.each { |filename| verify_pathname(bag_pathname.join(filename)) }
    true
  end

  # @param [Pathname] pathname The file whose existence should be verified
  # @return [Boolean] Test the existence of the specified path. Return true if file exists, raise exception if not
  def verify_pathname(pathname)
    raise "#{pathname.basename} not found at #{pathname}" unless pathname.exist?
    true
  end

  private

  # @param [String] tagfile_name The name of a "Key: value" tag file in the bag home directory
  # @return [Hash<String,String>] The properties parsed from the tag file. Lines without a colon are ignored.
  def read_tag_file(tagfile_name)
    properties = {}
    bag_pathname.join(tagfile_name).readlines.each do |raw_line|
      # Non-destructive strip: chomp!/strip! return nil when the line is unchanged
      key, value = raw_line.strip.split(':', 2)
      properties[key.strip] = value.strip if value
    end
    properties
  end

end
335
+
336
+
337
+ end
OSZAR »