class Anystyle::Parser::Normalizer

Constants

MONTH

Public Instance Methods

extract_edition(token, hash) click to toggle source
    # File lib/anystyle/parser/normalizer.rb
# Moves edition information out of +token+ and into hash[:edition].
#
# A numbered edition ("2nd ed.", "3. Aufl.", "4th edition") contributes its
# digits; a trailing "expanded", "illustrated", "revised" or "reprint"
# contributes the word itself. Matched text is deleted from +token+ in place.
# Whatever is already stored under hash[:edition] is kept in front, and all
# parts are joined with ", ". The key is only written when at least one
# part exists.
#
# token - String to scan (mutated).
# hash  - Hash that receives the :edition entry.
#
# Returns the joined edition String, or nil when there is nothing to store.
def extract_edition(token, hash)
  parts = [hash[:edition]].flatten.compact

  # Order matters: each pattern sees the token as left by the previous one.
  patterns = [
    /[^[:alnum:]]*(\d+)(?:st|nd|rd|th)?\s*(?:Aufl(?:age|\.)|ed(?:ition|\.)?)[^[:alnum:]]*/i,
    /(?:\band)?[^[:alnum:]]*([Ee]xpanded)[^[:alnum:]]*$/,
    /(?:\band)?[^[:alnum:]]*([Ii]llustrated)[^[:alnum:]]*$/,
    /(?:\band)?[^[:alnum:]]*([Rr]evised)[^[:alnum:]]*$/,
    /(?:\band)?[^[:alnum:]]*([Rr]eprint)[^[:alnum:]]*$/
  ]

  patterns.each do |pattern|
    # gsub! returns nil when nothing was replaced.
    next unless token.gsub!(pattern, '')

    parts << Regexp.last_match(1)
  end

  hash[:edition] = parts.join(', ') unless parts.empty?
end
method_missing(name, *arguments, &block) click to toggle source
Calls superclass method
   # File lib/anystyle/parser/normalizer.rb
# Routes any call named normalize_<field> that has no dedicated method to
# the generic #normalize, passing <field> (as a Symbol) as the key together
# with the original arguments and block. Every other unknown method falls
# through to the superclass implementation.
def method_missing(name, *arguments, &block)
  field = name.to_s[/^normalize_(.+)$/, 1]
  return super if field.nil?

  normalize(field.to_sym, *arguments, &block)
end

# Keeps respond_to? in agreement with method_missing: every name of the form
# normalize_<field> is reported as answerable; anything else is delegated to
# the superclass.
def respond_to_missing?(name, include_private = false)
  !(name.to_s =~ /^normalize_(.+)$/).nil? || super
end
normalize(key, hash) click to toggle source

Default normalizer. Strips punctuation.

   # File lib/anystyle/parser/normalizer.rb
# Default normalizer: strips leading and trailing non-alphanumeric
# characters from the value stored under +key+.
#
# hash[key] may be a single String or an Array; in the Array case the first
# element is the value and the remaining elements are handed to #unmatched.
# The String is modified in place and written back under +key+.
#
# Returns +hash+.
def normalize(key, hash)
  value, *extra = hash[key]
  unmatched(key, hash, extra) if !extra.empty?

  hash[key] = value.tap { |text| text.gsub!(/^[^[:alnum:]]+|[^[:alnum:]]+$/, '') }
  hash
end
normalize_accessed(hash) click to toggle source
   # File lib/anystyle/parser/normalizer.rb
# Removes the words "accessed" / "retrieved" (any case, optional colon and
# following whitespace) from the :accessed value, leaving the rest.
#
# Extra elements of an Array value are handed to #unmatched. The String is
# modified in place.
#
# Returns +hash+.
def normalize_accessed(hash)
  stamp, *extra = hash[:accessed]
  extra.empty? || unmatched(:accessed, hash, extra)

  stamp.gsub!(/(accessed|retrieved):?\s*/i, '')
  hash.store(:accessed, stamp)

  hash
end
normalize_author(hash) click to toggle source
    # File lib/anystyle/parser/normalizer.rb
 89 def normalize_author(hash)
 90   authors, *dangling = hash[:author]
 91   unmatched(:author, hash, dangling) unless dangling.empty?
 92 
 93   if authors =~ /\b[Ee]d((s|itors?|ited)\b|\.)/ && !hash.has_key?(:editor)
 94     hash[:editor] = hash.delete(:author)
 95     hash = normalize_editor(hash)
 96   else
 97     hash[:'more-authors'] = true if strip_et_al(authors)
 98     authors.gsub!(/^[^[:alnum:]]+|[^[:alnum:]]+$/, '')
 99     hash[:author] = normalize_names(authors)
100   end
101 
102   hash
103 end
normalize_booktitle(hash) click to toggle source
    # File lib/anystyle/parser/normalizer.rb
# Normalizes the :booktitle value: drops a leading "in:" / "In ", lets
# #extract_edition pull out edition information, then trims leading
# whitespace and trailing punctuation/whitespace.
#
# Extra elements of an Array value are handed to #unmatched. The String is
# modified in place.
#
# Returns +hash+.
def normalize_booktitle(hash)
  title, *extra = hash[:booktitle]
  unmatched(:booktitle, hash, extra) unless extra.empty?

  [/^in:\s+/i, /^In\s+/i].each { |prefix| title.gsub!(prefix, '') }

  extract_edition(title, hash)

  title.gsub!(/^\s+|[\.,:;\s]+$/, '')
  hash.store(:booktitle, title)

  hash
end
normalize_citation_number(hash) click to toggle source
   # File lib/anystyle/parser/normalizer.rb
# Reduces the :citation_number value to its numeric label.
#
# The label is the first run that starts with a digit and continues with
# word characters, commas, dots or hyphens. The continuation is optional so
# that single-digit labels such as "[1]" are unwrapped just like "[12]".
# If the token contains no digit it is stored unchanged.
#
# Extra elements of an Array value are handed to #unmatched.
#
# Returns +hash+.
def normalize_citation_number(hash)
  token, *dangling = hash[:citation_number]
  unmatched(:citation_number, hash, dangling) unless dangling.empty?

  hash[:citation_number] = token[/\d[\w,.-]*/] || token
  hash
end
normalize_date(hash) click to toggle source
    # File lib/anystyle/parser/normalizer.rb
# Rewrites the :date value as "YYYY", "YYYY-MM" or "YYYY-MM-DD".
#
# All date tokens are joined with spaces. The month is looked up through the
# MONTH constant; the year is the first four-digit run; a day (the first
# standalone one- or two-digit number) is only considered when a month was
# found. If no four-digit year is present the :date entry is left as is.
#
# Returns +hash+.
def normalize_date(hash)
  text = Array(hash[:date]).join(' ')

  month = MONTH[text]
  month = '%02d' % month unless month.nil?

  year = text[/(\d{4})/, 1]
  return hash if year.nil?

  day = '%02d' % Regexp.last_match(1).to_i if month && text =~ /\b(\d{1,2})\b/

  # compact drops whichever of month/day could not be determined.
  hash[:date] = [year, month, day].compact.join('-')
  hash
end
normalize_director(hash) click to toggle source
    # File lib/anystyle/parser/normalizer.rb
# Normalizes the :director value: trims surrounding non-word characters,
# removes the label "direct", "director" or "directed" together with
# adjacent non-letters, removes the word "by", and passes the remainder
# through #normalize_names.
#
# The String stored under :director is modified in place.
#
# Returns +hash+.
def normalize_director(hash)
  directors = hash[:director]

  directors.gsub!(/^\W+|\W+$/, '')
  # The label pattern must stay on one line: without the /x flag any stray
  # characters or line breaks inside it are matched literally.
  directors.gsub!(/[^[:alpha:]]*direct(or|ed)?\b[^[:alpha:]]*/i, '')
  directors.gsub!(/\bby\b/i, '')

  hash[:director] = normalize_names(directors)
  hash
end
normalize_editor(hash) click to toggle source
    # File lib/anystyle/parser/normalizer.rb
# Normalizes the :editor value.
#
# When the value is an Array with more than one element:
# * if the hash has no :author, the first element becomes :author, the
#   remainder becomes :editor, and both are normalized (author first, then
#   editor again);
# * otherwise, if the second element contains digits, they are stored as an
#   Integer under :edition;
# * otherwise the extra elements are handed to #unmatched.
#
# The editor text then has "et al."-style suffixes stripped (setting
# :'more-editors'), and editor labels ("in:", "Ed./eds/editor(s)/edited",
# "Hrsg", "g"/"gg", "Herausgeber", "herausgegeben von", "by") removed.
# If a "trans"/"translated" label was removed, the normalized names are
# stored under :translator instead of :editor.
#
# Returns the resulting hash.
def normalize_editor(hash)
  names, *extra = hash[:editor]

  unless extra.empty?
    if !hash.has_key?(:author)
      hash[:author] = names
      hash[:editor] = extra
      hash = normalize_author(hash)
      return normalize_editor(hash)
    elsif extra[0] =~ /(\d+)/
      hash[:edition] = Regexp.last_match(1).to_i
    else
      unmatched(:editor, hash, extra)
    end
  end

  hash[:'more-editors'] = true if strip_et_al(names)

  # Applied in this order; each pattern sees the result of the previous one.
  [
    /^\W+|\W+$/,
    /^in:?\s+/i,
    /\W*\b[Ee]d(s|itors?|ited)?\b\W*/,
    /\W*\b([Hh]rsg|gg?|Herausgeber)\b\W*/,
    /\b[Hh]erausgegeben von\b/,
    /\bby\b/i
  ].each { |label| names.gsub!(label, '') }

  # gsub! returns nil when no translator label was present.
  translated = !names.gsub!(/[^[:alpha:]]*trans(lated)?[^[:alpha:]]*/i, '').nil?

  cleaned = normalize_names(names)
  if translated
    hash.delete(:editor)
    hash[:translator] = cleaned
  else
    hash[:editor] = cleaned
  end

  hash
end
normalize_isbn(hash) click to toggle source
    # File lib/anystyle/parser/normalizer.rb
# Reduces the :isbn value to the ISBN itself: the first run of digits and
# hyphens, optionally followed by the ISBN-10 check character "X" (either
# case) when that letter ends a word. Without the optional "X" an ISBN such
# as "0-8044-2957-X" would be cut to "0-8044-2957-".
#
# If the token contains no digits or hyphens, :isbn is set to nil.
# Extra elements of an Array value are handed to #unmatched.
#
# Returns +hash+.
def normalize_isbn(hash)
  isbn, *dangling = hash[:isbn]
  unmatched(:isbn, hash, dangling) unless dangling.empty?

  hash[:isbn] = isbn[/[\d-]+(?:[Xx]\b)?/]
  hash
end
normalize_journal(hash) click to toggle source
    # File lib/anystyle/parser/normalizer.rb
# Trims the :journal value: leading whitespace and trailing
# punctuation (. , : ;) or whitespace are removed in place.
#
# Extra elements of an Array value are handed to #unmatched.
#
# Returns +hash+.
def normalize_journal(hash)
  name, *extra = hash[:journal]
  unmatched(:journal, hash, extra) if !extra.empty?

  hash[:journal] = name.tap { |text| text.gsub!(/^[\s]+|[\.,:;\s]+$/, '') }
  hash
end
normalize_key(hash) click to toggle source
   # File lib/anystyle/parser/normalizer.rb
# Normalizes the :key value: strips surrounding non-alphanumeric characters
# and then a leading "bibitem{" (any case). The String is modified in place.
#
# Extra elements of an Array value are handed to #unmatched.
#
# Returns +hash+.
def normalize_key(hash)
  label, *extra = hash[:key]
  unmatched(:key, hash, extra) unless extra.empty?

  # Punctuation first, so a leading backslash is gone before "bibitem{" is
  # looked for at the start of the line.
  [/^[^[:alnum:]]+|[^[:alnum:]]+$/, /^bibitem\{/i].each do |pattern|
    label.gsub!(pattern, '')
  end

  hash.store(:key, label)
  hash
end
normalize_location(hash) click to toggle source
    # File lib/anystyle/parser/normalizer.rb
# Normalizes the :location value: strips surrounding non-alphanumeric
# characters and, when the hash has no :publisher yet and the location
# contains a colon ("City: Publisher"), splits it at the first colon into
# :location and :publisher.
#
# The split is limited to two parts so that a publisher name that itself
# contains a colon is kept whole instead of being truncated.
#
# Extra elements of an Array value are handed to #unmatched under the
# :location label.
#
# Returns +hash+.
def normalize_location(hash)
  location, *dangling = hash[:location]
  unmatched(:location, hash, dangling) unless dangling.empty?

  location.gsub!(/^[^[:alnum:]]+|[^[:alnum:]]+$/, '')

  if !hash.has_key?(:publisher) && location =~ /:/
    location, publisher = location.split(/\s*:\s*/, 2)
    hash[:publisher] = publisher
  end

  hash[:location] = location
  hash
end
normalize_medium(hash) click to toggle source
    # File lib/anystyle/parser/normalizer.rb
# Rewrites the :medium value as its word-character runs joined by hyphens
# (e.g. "[CD ROM]" becomes "CD-ROM").
#
# Extra elements of an Array value are handed to #unmatched.
#
# Returns +hash+.
def normalize_medium(hash)
  medium, *extra = hash[:medium]
  unmatched(:medium, hash, extra) unless extra.empty?

  # A leading separator yields an empty first element; drop it.
  words = medium.split(/\W+/).reject(&:empty?)
  hash[:medium] = words.join('-')

  hash
end
normalize_names(names) click to toggle source
    # File lib/anystyle/parser/normalizer.rb
# Turns a raw name list into "sort order" names joined by " and ".
#
# The input String is modified in place: ellipses are removed and ";" / ":"
# become ",". If the list starts with a Vancouver-style name (surname
# followed by up to four capital initials, e.g. "Rang HP, Dale MM"), a comma
# is inserted between every such surname and its initials. The result is
# parsed with Namae; each parsed name has its initials normalized and is
# rendered via sort_order.
#
# If anything raises a StandardError, its message is written with warn and
# the (possibly already modified) input String is returned instead.
def normalize_names(names)
  names.gsub!(/\s*(\.\.\.|…)\s*/, '')
  names.gsub!(/;|:/, ',')

  # Add surname/initial punctuation separator for Vancouver-style names
  # E.g. Rang HP, Dale MM, Ritter JM, Moore PK
  if names =~ /^(\p{Lu}[^\s,.]+)\s+([\p{Lu}][\p{Lu}\-]{0,3})(,|[.]?$)/
    names.gsub!(/\b(\p{Lu}[^\s,.]+)\s+([\p{Lu}][\p{Lu}\-]{0,3})(,|[.]?$)/, '\1, \2\3')
  end

  formatted = Namae.parse!(names).map do |name|
    name.normalize_initials
    name.sort_order
  end

  formatted.join(' and ')
rescue => e
  warn e.message
  names
end
normalize_pages(hash) click to toggle source
    # File lib/anystyle/parser/normalizer.rb
# Normalizes the :pages value.
#
# A compound token of the form "volume.issue(year):pages" (issue and year
# optional) is split into :volume, :number and :year (as Integers) and
# :pages. Afterwards, if :pages holds two numbers separated by non-digits
# they are joined with an en-dash; if it holds a single number, that number
# alone is kept.
#
# Extra elements of an Array value are handed to #unmatched.
#
# Returns +hash+.
def normalize_pages(hash)
  pages, *extra = hash[:pages]
  unmatched(:pages, hash, extra) unless extra.empty?

  # "volume.issue(year):pp"
  case pages
  when /(\d+) (?: \.(\d+))? (?: \( (\d{4}) \))? : (\d.*)/x
    volume, number, year, range = Regexp.last_match.captures

    hash[:volume] = volume.to_i
    hash[:number] = number.to_i unless number.nil?
    hash[:year] = year.to_i unless year.nil?
    hash[:pages] = range
  end

  case hash[:pages]
  when /(\d+)\D+(\d+)/
    hash[:pages] = Regexp.last_match.captures.join('–') # en-dash
  when /(\d+)/
    hash[:pages] = Regexp.last_match(1)
  end

  hash
end
normalize_producer(hash) click to toggle source
    # File lib/anystyle/parser/normalizer.rb
# Normalizes the :producer value: trims surrounding non-word characters,
# removes the label "produc", "producer" or "produced" together with
# adjacent non-letters, removes the word "by", and passes the remainder
# through #normalize_names.
#
# The result is stored back under :producer, so an existing :director entry
# is not overwritten. The String is modified in place.
#
# Returns +hash+.
def normalize_producer(hash)
  producers = hash[:producer]

  producers.gsub!(/^\W+|\W+$/, '')
  producers.gsub!(/[^[:alpha:]]*produc(er|ed)?[^[:alpha:]]*/i, '')
  producers.gsub!(/\bby\b/i, '')

  hash[:producer] = normalize_names(producers)
  hash
end
normalize_publisher(hash) click to toggle source
    # File lib/anystyle/parser/normalizer.rb
# Normalizes the :publisher value with the default #normalize and then
# resolves placeholder publishers: when the publisher is literally
# "producer(s)", "author(s)" or "editor(s)" (any case), it is replaced by
# the hash's :producer, :author or :editor entry respectively.
#
# Returns +hash+.
def normalize_publisher(hash)
  normalize :publisher, hash

  case hash[:publisher]
  when /^producers?$/i
    hash[:publisher] = hash[:producer]
  when /^authors?$/i
    hash[:publisher] = hash[:author]
  when /^editors?$/i
    hash[:publisher] = hash[:editor]
  end

  hash
end
normalize_source(hash) click to toggle source
    # File lib/anystyle/parser/normalizer.rb
# Normalizes the :source value.
#
# When the source mentions "dissertation abstracts" (any case), the entry is
# typed as :thesis; a trailing "section X: <name> ..." part is cut from the
# source text in place and <name> is stored under :category.
#
# Extra elements of an Array value are handed to #unmatched.
#
# Returns +hash+.
def normalize_source(hash)
  source, *extra = hash[:source]
  unmatched(:source, hash, extra) unless extra.empty?

  if source =~ /dissertation abstracts/i
    source.gsub!(/\s*section \w: ([[:alnum:]\s]+).*$/i, '')

    # No last match (nil) when the section part was absent.
    category = Regexp.last_match(1)
    hash[:category] = category unless category.nil?
    hash[:type] = :thesis
  end

  hash[:source] = source
  hash
end
normalize_title(hash) click to toggle source
    # File lib/anystyle/parser/normalizer.rb
# Normalizes the :title value.
#
# If the value is an Array, its second element is treated as a source: it is
# stored under :source and run through the default #normalize. Edition
# information is moved out of the title by #extract_edition; then leading
# whitespace and trailing punctuation are trimmed and one pair of
# surrounding quotation marks is removed. The title String is modified in
# place.
#
# Returns +hash+.
def normalize_title(hash)
  title, source = hash[:title]

  if !source.nil?
    hash[:source] = source
    normalize(:source, hash)
  end

  extract_edition(title, hash)

  [
    [/^\s+|[\.,:;\s]+$/, ''],
    [/^["'”’´‘“`](.+)["'”’´‘“`]$/, '\1']
  ].each { |pattern, replacement| title.gsub!(pattern, replacement) }

  hash.store(:title, title)
  hash
end
normalize_translator(hash) click to toggle source
    # File lib/anystyle/parser/normalizer.rb
# Normalizes the :translator value: removes German ("Übersetzt von",
# "in der Übersetzung von", ...), English ("trans", "transl",
# "translated", "by") and "trad." labels plus surrounding non-word
# characters, then passes the remainder through #normalize_names.
#
# The String stored under :translator is modified in place.
#
# Returns +hash+.
def normalize_translator(hash)
  names = hash[:translator]

  # Applied in this order; each pattern sees the result of the previous one.
  [
    /\b([Ii]n (d|ein)er )?[Üü]ber(s\.|setzt|setzung|tragen|tragung) v(\.|on\b)/,
    /^\W+|\W+$/,
    /[^[:alpha:]]*\btrans(l(ated)?)?\b[^[:alpha:]]*/i,
    /\bby\b/i,
    /\btrad\./i
  ].each { |label| names.gsub!(label, '') }

  hash[:translator] = normalize_names(names)
  hash
end
normalize_url(hash) click to toggle source
    # File lib/anystyle/parser/normalizer.rb
# Reduces the :url value to the first URL-like substring: an optional
# "scheme://" followed by dotted word characters and path characters.
# If nothing URL-like is found the token is stored unchanged.
#
# Extra elements of an Array value are handed to #unmatched.
#
# Returns +hash+.
def normalize_url(hash)
  url, *extra = hash[:url]
  unmatched(:url, hash, extra) unless extra.empty?

  extracted = url[/([a-z]+:\/\/)?\w+\.\w+[\w\.\/%-]+/i]
  hash[:url] = extracted || url

  hash
end
normalize_volume(hash) click to toggle source
    # File lib/anystyle/parser/normalizer.rb
# Normalizes the :volume value, splitting out issue numbers and pages.
#
# * "vol:first-last" with no :pages in the hash yet: the volume is stored as
#   an Integer, the page range goes to :pages, and #normalize_pages runs.
# * Otherwise the first matching form wins:
#   - volume followed by a number range  -> :volume Integer, :number String
#   - "[vol] no. a-b"                    -> :volume Integer (if present),
#                                           :number String
#   - "[vol] no. n"                      -> :volume Integer (if present),
#                                           :number Integer
#   - two numbers                        -> :volume and :number Integers
#   - one number                         -> :volume Integer
#
# Extra elements of an Array value are handed to #unmatched.
#
# Returns the resulting hash.
def normalize_volume(hash)
  volume, *extra = hash[:volume]
  unmatched(:volume, hash, extra) unless extra.empty?

  if !hash.has_key?(:pages) && volume =~ /\D*(\d+):(\d+(?:[—–-]+)\d+)/
    number, range = Regexp.last_match.captures
    hash[:volume] = number.to_i
    hash[:pages] = range
    return normalize_pages(hash)
  end

  case volume
  when /\D*(\d+)\D+(\d+[\s\/&—–-]+\d+)/
    first, second = Regexp.last_match.captures
    hash[:volume] = first.to_i
    hash[:number] = second
  when /(\d+)?\D+no\.\s*(\d+\D+\d+)/
    first, second = Regexp.last_match.captures
    hash[:volume] = first.to_i unless first.nil?
    hash[:number] = second
  when /(\d+)?\D+no\.\s*(\d+)/
    first, second = Regexp.last_match.captures
    hash[:volume] = first.to_i unless first.nil?
    hash[:number] = second.to_i
  when /\D*(\d+)\D+(\d+)/
    first, second = Regexp.last_match.captures
    hash[:volume] = first.to_i
    hash[:number] = second.to_i
  when /(\d+)/
    hash[:volume] = Regexp.last_match(1).to_i
  end

  hash
end
strip_et_al(names) click to toggle source
    # File lib/anystyle/parser/normalizer.rb
# Cuts an "and others" marker and everything after it from +names+ (in
# place). Recognized markers: "et al", "et coll", "u. a.", "and others",
# "& others".
#
# Returns true if a marker was removed, false otherwise.
def strip_et_al(names)
  removed = names.sub!(/(\bet\s+(al|coll)\b|\bu\.\s*a\.|(\band|\&)\s+others).*$/, '')
  !removed.nil?
end

Private Instance Methods

unmatched(label, hash, tokens) click to toggle source
    # File lib/anystyle/parser/normalizer.rb
# Records leftover tokens for +label+ in +hash+ under the String key
# "unmatched-<label>", joined by single spaces.
#
# Returns the joined String.
def unmatched(label, hash, tokens)
  leftovers = tokens.join(' ')
  hash.store("unmatched-#{label}", leftovers)
end