stanford-mods-normalizer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 0a0d58dd1d4b6d6fcfa74b8338e9af8d0d95b62aa8f7967ead17018fb2d616fe
4
+ data.tar.gz: 39381c4aefad607ed2602296e13904b606e102d3a6e5e7d18de3ca37380845dc
5
+ SHA512:
6
+ metadata.gz: e5226a45ff7ccf1d1f27b54082f35fe5c95ea12730961c35444fb79ded554ed943cabf8d9e30ac54b76006650779ec86fc33be294092bf1a07ffac859fb3ccbc
7
+ data.tar.gz: '09d6b33969eb6a38c24fa4696e9754cdcb105046cfa7ddec2990fba5d2cfef21423509718a0da68ea541d8f68d29c1660cb1896b3e71a255d667213814c6e0b8'
@@ -0,0 +1,5 @@
1
+ # rspec failure tracking
2
+ .rspec_status
3
+
4
+ Gemfile.lock
5
+ pkg/
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
@@ -0,0 +1,8 @@
1
+ inherit_from: .rubocop_todo.yml
2
+
3
+ Metrics/LineLength:
4
+ Max: 140
5
+
6
+ Metrics/BlockLength:
7
+ Exclude:
8
+ - 'spec/**/*_spec.rb'
@@ -0,0 +1,48 @@
1
+ # This configuration was generated by
2
+ # `rubocop --auto-gen-config`
3
+ # on 2018-03-14 09:10:19 -0500 using RuboCop version 0.53.0.
4
+ # The point is for the user to remove these configuration records
5
+ # one by one as the offenses are removed from the code base.
6
+ # Note that changes in the inspected code, or installation of new
7
+ # versions of RuboCop, may require this file to be generated again.
8
+
9
+ # Offense count: 3
10
+ Metrics/AbcSize:
11
+ Max: 30
12
+
13
+ # Offense count: 2
14
+ # Configuration parameters: CountComments, ExcludedMethods.
15
+ Metrics/BlockLength:
16
+ Max: 116
17
+
18
+ # Offense count: 1
19
+ # Configuration parameters: CountComments.
20
+ Metrics/ClassLength:
21
+ Max: 110
22
+
23
+ # Offense count: 1
24
+ Metrics/CyclomaticComplexity:
25
+ Max: 8
26
+
27
+ # Offense count: 4
28
+ # Configuration parameters: CountComments.
29
+ Metrics/MethodLength:
30
+ Max: 14
31
+
32
+ # Offense count: 2
33
+ Metrics/PerceivedComplexity:
34
+ Max: 8
35
+
36
+ # Offense count: 1
37
+ # Configuration parameters: MinNameLength, AllowNamesEndingInNumbers, AllowedNames, ForbiddenNames.
38
+ # AllowedNames: io, id
39
+ Naming/UncommunicativeMethodParamName:
40
+ Exclude:
41
+ - 'lib/stanford/mods/normalizer.rb'
42
+
43
+ # Offense count: 1
44
+ Style/Documentation:
45
+ Exclude:
46
+ - 'spec/**/*'
47
+ - 'test/**/*'
48
+ - 'lib/stanford/mods/normalizer.rb'
@@ -0,0 +1,4 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.3.6
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
# Resolve all gems from the public RubyGems index.
source 'https://rubygems.org'

# Expand `github: 'owner/repo'` shorthands into HTTPS clone URLs.
git_source(:github) do |repo_name|
  "https://github.com/#{repo_name}"
end

# Runtime and development dependencies are declared in stanford-mods-normalizer.gemspec.
gemspec
@@ -0,0 +1,5 @@
1
+ [![Build Status](https://travis-ci.org/sul-dlss/mods_normalizer.svg?branch=master)](https://travis-ci.org/sul-dlss/mods_normalizer)
2
+
3
+ # Stanford::Mods::Normalizer
4
+
5
+ Provides methods to normalize MODS XML according to the Stanford guidelines
@@ -0,0 +1,14 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
+ require 'rubocop/rake_task'
4
+
5
# `rake spec` runs the RSpec suite.
RSpec::Core::RakeTask.new(:spec)

desc 'Run style checker'
# `rake rubocop` fails the build when RuboCop reports an error.
RuboCop::RakeTask.new(:rubocop) { |task| task.fail_on_error = true }

# `rake ci` lints first, then runs the specs.
task ci: %i[rubocop spec]

# Plain `rake` runs the full CI pipeline.
task default: :ci
@@ -0,0 +1,217 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'stanford/mods/normalizer/version'
4
+
5
+ module Stanford
6
+ module Mods
7
+ class Normalizer
8
+ # Your code goes here...
9
+ require 'nokogiri'
10
+
11
+ # Linefeed character entity reference
12
# The numeric character entity reference for a linefeed. It is kept as the
# literal six-character entity (not a raw newline) because callers run it
# through Nokogiri::HTML(...).text to obtain the decoded character.
LINEFEED = '&#10;'.freeze
13
+
14
+ # Select all single <dateCreated> and <dateIssued> fields
15
+ LONE_DATE_XPATH = '//mods:originInfo/mods:dateCreated[1][not(following-sibling::*[1][self::mods:dateCreated])]' \
16
+ ' | //mods:originInfo/mods:dateIssued[1][not(following-sibling::*[1][self::mods:dateIssued])]'.freeze
17
+
18
+ # Select all <dateCreated> and <dateIssued> fields
19
+ DATE_CREATED_ISSUED_XPATH = '//mods:dateCreated | //mods:dateIssued'.freeze
20
+
21
+ # The official MODS namespace, courtesy of the Library of Congress
22
+ MODS_NAMESPACE = 'http://www.loc.gov/mods/v3'.freeze
23
+
24
+ # Selects <abstract>, <tableOfContents> and <note> when no namespace is present
25
+ LINEFEED_XPATH = '//abstract | //tableOfContents | //note'.freeze
26
+
27
+ # Selects <abstract>, <tableOfContents> and <note> when a namespace is present
28
+ LINEFEED_XPATH_NAMESPACED = '//ns:abstract | //ns:tableOfContents | //ns:note'.freeze
29
+
30
+ # Checks if a node has attributes that we make exceptions for. There are two such exceptions.
31
+ #
32
+ # * A "collection" attribute with the value "yes" <em>on a typeOfResource tag</em>.
33
+ # * A "manuscript" attribute with the value "yes" <em>on a typeOfResource tag</em>.
34
+ #
35
+ # Nodes that fall under any of these exceptions should not be deleted, even if they have no content.
36
+ #
37
+ # @param [Nokogiri::XML::Element] node An XML node.
38
+ # @return [Boolean] true if the node contains any of the exceptional attributes, false otherwise.
39
# Tells whether a node is exempt from empty-node removal: a <typeOfResource>
# element whose "collection" or "manuscript" attribute equals "yes"
# (compared case-insensitively).
#
# @param [Nokogiri::XML::Element] node An XML node; may be nil.
# @return [Boolean] true for an exceptional <typeOfResource>, false otherwise (including nil input).
def exceptional?(node)
  return false if node.nil?
  # Only <typeOfResource> can qualify, so decide that once rather than per attribute.
  return false unless node.name == 'typeOfResource'

  # Per the MODS schema any value other than 'yes' is invalid for these attributes.
  node.attributes.any? do |key, value|
    %w[collection manuscript].include?(key) && value.to_s.casecmp('yes').zero?
  end
end
57
+
58
+ # Recursive helper method for {Normalizer#clean_linefeeds} to do string substitution.
59
+ #
60
+ # @param [Nokogiri::XML::Element] node An XML node
61
+ # @return [String] A string composed of the entire contents of the given node,
62
+ # with substitutions made as described for {#clean_linefeeds}.
63
# Recursively flattens a node into a string with normalized linefeeds.
#
# Text nodes have every \r\n, \n, \r and literal backslash-n sequence replaced
# by +linefeed+. A <br> contributes one linefeed and a <p> contributes two,
# each placed before the flattened content of its children. Any other element
# contributes only its children's flattened content.
#
# @param [Nokogiri::XML::Node] node An XML node.
# @param [String] linefeed The replacement linefeed. Defaults to the decoded
#   form of LINEFEED; decoding through Nokogiri::HTML avoids '&#10;' being
#   re-escaped as '&amp;#10;' on output. It is resolved once per top-level
#   call and handed down the recursion so the parse is not repeated per node.
# @return [String] The flattened text of the node.
def substitute_linefeeds(node, linefeed = Nokogiri::HTML(LINEFEED).text)
  return node.content.gsub(/(\r\n|\n|\r|\\n)/, linefeed) if node.text?

  prefix = case node.node_name
           when 'br' then linefeed
           when 'p' then linefeed * 2
           else ''
           end
  prefix + node.children.map { |child| substitute_linefeeds(child, linefeed) }.join
end
83
+
84
+ # Given the root of an XML document, replaces linefeed characters inside <tableOfContents>, <abstract> and <note> XML nodes with &#10;
85
+ # \n, \r, <br> and <br/> are all replaced by a single &#10;
86
+ # <p> is replaced by two &#10;
87
+ # </p> is removed
88
+ # \r\n is replaced by &#10;
89
+ # Any tags not listed above are removed. MODS 3.5 does not allow for anything other than text inside these three nodes.
90
+ #
91
+ # @param [Nokogiri::XML::NodeSet] node_list All <tableOfContents>, <abstract> and <note> elements.
92
+ # @return [Void] This method doesn't return anything, but introduces
93
+ # UTF-8 linefeed characters in place, as described above.
94
# Replaces the children of every node in the list with a single text value
# produced by substitute_linefeeds.
#
# @param [Nokogiri::XML::NodeSet] node_list The elements to flatten.
# @return [Void] Nodes are modified in place.
def clean_linefeeds(node_list)
  node_list.each do |element|
    # Compute the flattened text first; it is read from the children that are dropped next.
    flattened = substitute_linefeeds(element)
    element.children.remove
    element.content = flattened
  end
end
101
+
102
+ # Cleans up the text of a node:
103
+ #
104
+ # * Removes extra whitespace at the beginning and end.
105
+ # * Removes any consecutive whitespace within the string.
106
+ #
107
+ # @param [String] s The text of an XML node.
108
+ # @return [String] The cleaned string, as described. Returns nil if the input is nil, or if the input is an empty string.
109
# Collapses every run of whitespace to a single space and strips leading and
# trailing whitespace.
#
# @param [String] s The text of an XML node; may be nil.
# @return [String, nil] The cleaned string, or nil when the input is nil or
#   the empty string. Whitespace-only input yields an empty string.
def clean_text(s)
  return nil if s.nil? || s == ''

  s.gsub(/\s+/, ' ').strip
end
113
+
114
+ # Removes empty attributes from a given node.
115
+ #
116
+ # @param [Nokogiri::XML::Element] node An XML node.
117
+ # @return [Void] This method doesn't return anything, but modifies the XML tree starting at the given node.
118
# Removes every attribute whose value is empty or whitespace-only from the
# given node and, recursively, from all of its descendants.
#
# Attribute nodes are walked and unlinked directly rather than looked up by
# name, so two attributes that share a local name in different namespaces
# are each examined.
#
# @param [Nokogiri::XML::Node] node An XML node.
# @return [Void] The XML tree starting at the given node is modified in place.
def remove_empty_attributes(node)
  node.attribute_nodes.each do |attr|
    attr.remove if attr.value.to_s.strip.empty?
  end

  node.children.each { |child| remove_empty_attributes(child) }
end
130
+
131
+ # Removes empty nodes from an XML tree. See {#exceptional?} for nodes that are kept even if empty.
132
+ #
133
+ # @param [Nokogiri::XML::Element] node An XML node.
134
+ # @return [Void] This method doesn't return anything, but modifies the XML tree starting at the given node.
135
# Removes empty nodes from an XML tree, depth-first. A childless node is
# removed unless exceptional? says it must be kept.
#
# Text and CDATA nodes are leaves that carry content, so they are removed
# only when blank. CDATA is handled explicitly because it is not reported as
# text? and has no children; without this branch it would be treated as an
# empty node and its content lost.
#
# @param [Nokogiri::XML::Node] node An XML node.
# @return [Void] The XML tree starting at the given node is modified in place.
def remove_empty_nodes(node)
  if node.text? || node.cdata?
    # Text is judged by its serialized form; CDATA by its raw content, since
    # its serialization always includes the <![CDATA[ ]]> wrapper.
    raw = node.cdata? ? node.content : node.to_s
    node.remove if raw.strip.empty?
    return
  end

  node.children.each { |child| remove_empty_nodes(child) }

  # Re-read the children: the recursion above may have emptied this node.
  node.remove if !exceptional?(node) && node.children.empty?
end
149
+
150
+ # Removes leading and trailing spaces from a node.
151
+ #
152
+ # @param [Nokogiri::XML::Element] node An XML node.
153
+ # @return [Void] This method doesn't return anything, but modifies the entire XML tree starting at the
154
+ # given node, removing leading and trailing spaces from all text. If the input is nil,
155
+ # an exception will be raised.
156
# Strips leading and trailing whitespace from every text node in the tree
# rooted at the given node.
#
# Each text node is rewritten in place. Assigning to the parent's content
# instead would replace all of the parent's children, discarding sibling
# elements in mixed content and detaching the text nodes still to be visited.
#
# @param [Nokogiri::XML::Node] node An XML node.
# @return [Void] The tree is modified in place. A nil input raises NoMethodError.
def trim_text(node)
  if node.text?
    node.content = node.text.strip
  else
    node.children.each { |child| trim_text(child) }
  end
end
167
+
168
+ # Sometimes there are spurious decimal digits within the date fields. This method removes any trailing decimal points within
169
+ # <dateCreated> and <dateIssued>.
170
+ #
171
+ # @param [Nokogiri::XML::NodeSet] nodes A set of all affected <dateCreated> and <dateIssued> elements.
172
+ # @return [Void] The given document is modified in place.
173
# Drops a trailing ".<digits>" suffix from the content of each given node.
#
# @param [Nokogiri::XML::NodeSet] nodes The <dateCreated> and <dateIssued> elements to clean.
# @return [Void] The nodes are modified in place.
def clean_date_values(nodes)
  nodes.each do |date_node|
    original_value = date_node.content
    # Only the first line ending in ".<digits>" is affected (sub, not gsub).
    date_node.content = original_value.sub(/(.*)\.\d+$/, '\1')
  end
end
178
+
179
+ # Normalizes the given MODS XML document according to the Stanford guidelines.
180
+ #
181
+ # @param [Nokogiri::XML::Element] root The root of a MODS XML document.
182
+ # @return [Void] The given document is modified in place.
183
# Runs the full normalization pipeline over a MODS document: linefeed
# cleanup, empty-attribute removal, empty-node removal, text trimming and
# date cleanup, in that order.
#
# @param [Nokogiri::XML::Element] root The root of a MODS XML document.
# @return [Void] The given document is modified in place.
def normalize_mods_document(root)
  namespace = root.namespace
  linefeed_nodes =
    if namespace.nil?
      root.xpath(LINEFEED_XPATH)
    else
      root.xpath(LINEFEED_XPATH_NAMESPACED, 'ns' => namespace.href)
    end

  # Linefeeds go first: <br> and <p> must be converted before remove_empty_nodes deletes them.
  clean_linefeeds(linefeed_nodes)

  remove_empty_attributes(root)
  remove_empty_nodes(root)
  trim_text(root)

  # NOTE(review): unlike the linefeed selection above, the date XPath is always
  # bound to MODS_NAMESPACE, so documents without a namespace appear to get no
  # date cleanup. Confirm this is intended.
  date_nodes = root.xpath(DATE_CREATED_ISSUED_XPATH, 'mods' => MODS_NAMESPACE)
  clean_date_values(date_nodes)
end
196
+
197
+ # Normalizes the given MODS XML document according to the Stanford guidelines.
198
+ #
199
+ # @deprecated Use normalize_mods_document instead.
200
+ # @param [Nokogiri::XML::Element] root The root of a MODS XML document.
201
+ # @return [Void] The given document is modified in place.
202
# Backward-compatible alias that forwards to normalize_mods_document.
#
# @deprecated Use normalize_mods_document instead.
# @param [Nokogiri::XML::Element] root The root of a MODS XML document.
# @return [Void] The given document is modified in place.
def normalize_document(root)
  normalize_mods_document(root)
end
205
+
206
+ # Normalizes the given XML document string according to the Stanford guidelines.
207
+ #
208
+ # @param [String] xml_string An XML document
209
+ # @return [String] The XML string, with normalizations applied.
210
# Parses an XML string, normalizes it and serializes it back.
#
# @param [String] xml_string An XML document.
# @return [String] The XML string, with normalizations applied. Input that
#   parses to a document without a root element (for example an empty string)
#   is serialized back without normalization instead of raising.
def normalize_xml_string(xml_string)
  doc = Nokogiri::XML(xml_string)
  root = doc.root
  # Call the current entry point directly rather than the deprecated wrapper.
  normalize_mods_document(root) unless root.nil?
  doc.to_s
end
215
+ end
216
+ end
217
+ end
@@ -0,0 +1,7 @@
1
module Stanford
  module Mods
    # Reopened here only to carry the gem's version number, so the gemspec
    # can read it without loading the normalizer itself.
    class Normalizer
      # Version string of the released gem.
      VERSION = '0.1.0'.freeze
    end
  end
end
@@ -0,0 +1,30 @@
1
+
2
+ lib = File.expand_path('lib', __dir__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'stanford/mods/normalizer/version'
5
+
6
# Gem packaging metadata for stanford-mods-normalizer.
Gem::Specification.new do |spec|
  spec.name = 'stanford-mods-normalizer'
  spec.version = Stanford::Mods::Normalizer::VERSION
  spec.authors = ['Justin Coyne']
  spec.email = ['[email protected]']

  spec.summary = 'Provides methods to normalize MODS XML according to the Stanford guidelines '
  spec.homepage = 'https://github.com/sul-dlss/mods_normalizer'

  # Package every git-tracked file except tests, specs and features.
  tracked_files = `git ls-files -z`.split("\x0")
  spec.files = tracked_files.reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.bindir = 'exe'
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']

  spec.add_dependency 'nokogiri', '~> 1.8'

  # Development-only tooling, declared in the same order as before.
  {
    'rubocop' => '~> 0.53',
    'rubocop-rspec' => '~> 0.18',
    'bundler' => '~> 1.16',
    'equivalent-xml' => '>= 0.6.0',
    'rake' => '~> 10.0',
    'rspec' => '~> 3.0'
  }.each do |gem_name, requirement|
    spec.add_development_dependency gem_name, requirement
  end
end
metadata ADDED
@@ -0,0 +1,152 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: stanford-mods-normalizer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Justin Coyne
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2018-03-14 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.8'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.8'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rubocop
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '0.53'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '0.53'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rubocop-rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0.18'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0.18'
55
+ - !ruby/object:Gem::Dependency
56
+ name: bundler
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.16'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.16'
69
+ - !ruby/object:Gem::Dependency
70
+ name: equivalent-xml
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: 0.6.0
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: 0.6.0
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '10.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '10.0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rspec
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '3.0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '3.0'
111
+ description:
112
+ email:
113
114
+ executables: []
115
+ extensions: []
116
+ extra_rdoc_files: []
117
+ files:
118
+ - ".gitignore"
119
+ - ".rspec"
120
+ - ".rubocop.yml"
121
+ - ".rubocop_todo.yml"
122
+ - ".travis.yml"
123
+ - Gemfile
124
+ - README.md
125
+ - Rakefile
126
+ - lib/stanford/mods/normalizer.rb
127
+ - lib/stanford/mods/normalizer/version.rb
128
+ - stanford-mods-normalizer.gemspec
129
+ homepage: https://github.com/sul-dlss/mods_normalizer
130
+ licenses: []
131
+ metadata: {}
132
+ post_install_message:
133
+ rdoc_options: []
134
+ require_paths:
135
+ - lib
136
+ required_ruby_version: !ruby/object:Gem::Requirement
137
+ requirements:
138
+ - - ">="
139
+ - !ruby/object:Gem::Version
140
+ version: '0'
141
+ required_rubygems_version: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ requirements: []
147
+ rubyforge_project:
148
+ rubygems_version: 2.7.1
149
+ signing_key:
150
+ specification_version: 4
151
+ summary: Provides methods to normalize MODS XML according to the Stanford guidelines
152
+ test_files: []
OSZAR »