class Anystyle::Parser::Normalizer
Constants
- MONTH
Public Instance Methods
extract_edition(token, hash)
click to toggle source
# File lib/anystyle/parser/normalizer.rb, line 220
#
# Pulls edition information out of +token+ (mutating it in place) and
# stores it, comma-separated, under +hash[:edition]+. Any edition value
# already present in +hash+ is kept and listed first.
#
# token - String to scan; matched edition text is removed from it.
# hash  - Hash that receives the :edition entry.
#
# Returns the joined edition String, or nil when nothing was found (in
# which case +hash+ is left untouched).
def extract_edition(token, hash)
  found = [hash[:edition]].flatten.compact

  # Numbered editions such as "2nd ed.", "3 Auflage" or "4th edition".
  numbered = /[^[:alnum:]]*(\d+)(?:st|nd|rd|th)?\s*(?:Aufl(?:age|\.)|ed(?:ition|\.)?)[^[:alnum:]]*/i
  found << Regexp.last_match(1) if token.gsub!(numbered, '')

  # Trailing edition keywords, optionally introduced by "and". The order
  # matters: each pattern is anchored at the end of what is left of the
  # token after the previous removal.
  [
    /(?:\band)?[^[:alnum:]]*([Ee]xpanded)[^[:alnum:]]*$/,
    /(?:\band)?[^[:alnum:]]*([Ii]llustrated)[^[:alnum:]]*$/,
    /(?:\band)?[^[:alnum:]]*([Rr]evised)[^[:alnum:]]*$/,
    /(?:\band)?[^[:alnum:]]*([Rr]eprint)[^[:alnum:]]*$/
  ].each do |keyword|
    found << Regexp.last_match(1) if token.gsub!(keyword, '')
  end

  hash[:edition] = found.join(', ') unless found.empty?
end
method_missing(name, *arguments, &block)
click to toggle source
Calls superclass method
# File lib/anystyle/parser/normalizer.rb, line 40
#
# Routes any +normalize_<key>+ call that has no dedicated method to the
# default #normalize, passing +<key>+ as a Symbol followed by the original
# arguments. Every other unknown method is handed to the superclass.
def method_missing(name, *arguments, &block)
  case name.to_s
  when /^normalize_(.+)$/
    normalize($1.to_sym, *arguments, &block)
  else
    super
  end
end

# Keeps +respond_to?+ (and +method+) truthful about the dynamic
# +normalize_<key>+ methods answered by #method_missing above.
def respond_to_missing?(name, include_private = false)
  name.to_s =~ /^normalize_(.+)$/ ? true : super
end
normalize(key, hash)
click to toggle source
Default normalizer. Strips leading and trailing non-alphanumeric characters from the token stored under the given key.
# File lib/anystyle/parser/normalizer.rb, line 50
#
# Default normalizer: keeps the first token stored under +key+, hands any
# further tokens to #unmatched, and trims non-alphanumeric characters from
# both ends of the kept token (in place).
#
# key  - the Hash key to normalize.
# hash - the Hash being normalized; modified in place.
#
# Returns the same +hash+.
def normalize(key, hash)
  value, *extras = hash[key]
  unmatched(key, hash, extras) unless extras.empty?

  value.gsub!(/^[^[:alnum:]]+|[^[:alnum:]]+$/, '')

  hash.tap { |h| h[key] = value }
end
normalize_accessed(hash)
click to toggle source
# File lib/anystyle/parser/normalizer.rb, line 60
#
# Removes the "accessed"/"retrieved" label (case-insensitive, with an
# optional colon and following whitespace) from the first :accessed token.
# Additional tokens are passed to #unmatched.
#
# Returns the same +hash+.
def normalize_accessed(hash)
  stamp, *extras = hash[:accessed]
  unmatched(:accessed, hash, extras) unless extras.empty?

  label = /(accessed|retrieved):?\s*/i
  stamp.gsub!(label, '')

  hash.tap { |h| h[:accessed] = stamp }
end
normalize_booktitle(hash)
click to toggle source
# File lib/anystyle/parser/normalizer.rb, line 246
#
# Cleans up the first :booktitle token: drops a leading "in:" / "In "
# marker, moves edition details into +hash[:edition]+ via
# #extract_edition, and trims surrounding whitespace and trailing
# punctuation. Additional tokens are passed to #unmatched.
#
# Returns the same +hash+.
def normalize_booktitle(hash)
  name, *extras = hash[:booktitle]
  unmatched(:booktitle, hash, extras) unless extras.empty?

  # Applied one after the other, so "In: In ..." loses both markers.
  [/^in:\s+/i, /^In\s+/i].each { |marker| name.gsub!(marker, '') }

  extract_edition(name, hash)

  name.gsub!(/^\s+|[\.,:;\s]+$/, '')

  hash.tap { |h| h[:booktitle] = name }
end
normalize_citation_number(hash)
click to toggle source
# File lib/anystyle/parser/normalizer.rb, line 81
#
# Reduces the first :citation_number token to the number itself: a digit
# followed by any run of word characters, commas, dots or hyphens. When
# the token contains no digit it is stored unchanged. Additional tokens
# are passed to #unmatched.
#
# Returns the same +hash+.
def normalize_citation_number(hash)
  token, *dangling = hash[:citation_number]
  unmatched(:citation_number, hash, dangling) unless dangling.empty?

  # `*` rather than `+` after the leading digit: a single-digit label such
  # as "[1]" must also be extracted instead of falling back to the raw
  # token.
  hash[:citation_number] = token[/\d[\w,.-]*/] || token
  hash
end
normalize_date(hash)
click to toggle source
# File lib/anystyle/parser/normalizer.rb, line 286
#
# Rewrites +hash[:date]+ as "YYYY", "YYYY-MM" or "YYYY-MM-DD".
#
# All :date tokens are joined with spaces. The month is looked up through
# the MONTH constant; the year is the first run of four digits; a day
# (a stand-alone one- or two-digit number) is only considered when a month
# was found. If no four-digit year is present, +hash[:date]+ is left as it
# was.
#
# Returns the same +hash+.
def normalize_date(hash)
  text = Array(hash[:date]).join(' ')

  month_number = MONTH[text]
  month = month_number.nil? ? nil : '%02d' % month_number

  year = text[/(\d{4})/, 1]
  return hash if year.nil?

  day_digits = month ? text[/\b(\d{1,2})\b/, 1] : nil
  day = day_digits ? '%02d' % day_digits.to_i : nil

  hash[:date] = [year, month, day].compact.join('-')
  hash
end
normalize_director(hash)
click to toggle source
# File lib/anystyle/parser/normalizer.rb, line 156
#
# Strips surrounding non-word characters, the "direct"/"director"/
# "directed" marker and the word "by" from +hash[:director]+ (in place),
# then stores the result of #normalize_names under :director.
#
# Returns the same +hash+.
def normalize_director(hash)
  directors = hash[:director]

  directors.gsub!(/^\W+|\W+$/, '')
  # The pattern previously contained a stray ":w" followed by a line break
  # (an editor artefact), which kept it from ever matching real input.
  directors.gsub!(/[^[:alpha:]]*direct(or|ed)?\b[^[:alpha:]]*/i, '')
  directors.gsub!(/\bby\b/i, '')

  hash[:director] = normalize_names(directors)
  hash
end
normalize_editor(hash)
click to toggle source
# File lib/anystyle/parser/normalizer.rb, line 105
#
# Normalizes +hash[:editor]+.
#
# When more than one :editor token is present:
# * if +hash+ has no :author yet, the first token becomes the author
#   (normalized through #normalize_author) and the remaining tokens are
#   re-processed as the editor;
# * otherwise, if the second token contains a number, that number is
#   stored as :edition;
# * otherwise the extra tokens are passed to #unmatched.
#
# The first token then loses "et al."-style suffixes (recorded as
# :'more-editors'), surrounding non-word characters, and English/German
# editor markers. If a "trans"/"translated" marker was present, the result
# is stored under :translator instead of :editor.
#
# Returns the normalized hash.
def normalize_editor(hash)
  editors, *dangling = hash[:editor]

  unless dangling.empty?
    if !hash.key?(:author)
      hash[:author] = editors
      hash[:editor] = dangling
      return normalize_editor(normalize_author(hash))
    elsif dangling[0] =~ /(\d+)/
      hash[:edition] = $1.to_i
    else
      unmatched(:editor, hash, dangling)
    end
  end

  hash[:'more-editors'] = true if strip_et_al(editors)

  editors.gsub!(/^\W+|\W+$/, '')
  editors.gsub!(/^in:?\s+/i, '')
  editors.gsub!(/\W*\b[Ee]d(s|itors?|ited)?\b\W*/, '')
  # "Hrsg" / "Hrsgg" / "Herausgeber". The former alternation
  # `[Hh]rsg|gg?` removed a stand-alone "g"/"gg" and never matched
  # the plural abbreviation "Hrsgg".
  editors.gsub!(/\W*\b([Hh]rsgg?|Herausgeber)\b\W*/, '')
  editors.gsub!(/\b[Hh]erausgegeben von\b/, '')
  editors.gsub!(/\bby\b/i, '')

  # gsub! returns nil when nothing was replaced, so this doubles as the
  # "was a translator marker present?" test.
  is_trans = !!editors.gsub!(/[^[:alpha:]]*trans(lated)?[^[:alpha:]]*/i, '')

  hash[:editor] = normalize_names(editors)
  hash[:translator] = hash.delete(:editor) if is_trans

  hash
end
normalize_isbn(hash)
click to toggle source
# File lib/anystyle/parser/normalizer.rb, line 389
#
# Reduces the first :isbn token to the ISBN itself. Additional tokens are
# passed to #unmatched.
#
# A full-length candidate is preferred: a digit, at least eight further
# digits/hyphens, and a final digit or "X" (the ISBN-10 check character).
# This keeps a trailing "X" and skips the "13" of an "ISBN-13:" label.
# When no such candidate exists, the first run of digits and hyphens is
# used, as before; the value is nil when the token contains neither.
#
# Returns the same +hash+.
def normalize_isbn(hash)
  isbn, *dangling = hash[:isbn]
  unmatched(:isbn, hash, dangling) unless dangling.empty?

  hash[:isbn] = isbn[/\d[\d-]{8,}[\dXx]/] || isbn[/[\d-]+/]

  hash
end
normalize_journal(hash)
click to toggle source
# File lib/anystyle/parser/normalizer.rb, line 261
#
# Trims leading whitespace and trailing whitespace/punctuation
# (".", ",", ":", ";") from the first :journal token, in place.
# Additional tokens are passed to #unmatched.
#
# Returns the same +hash+.
def normalize_journal(hash)
  name, *extras = hash[:journal]
  unmatched(:journal, hash, extras) unless extras.empty?

  name.gsub!(/^[\s]+|[\.,:;\s]+$/, '')

  hash.tap { |h| h[:journal] = name }
end
normalize_key(hash)
click to toggle source
# File lib/anystyle/parser/normalizer.rb, line 70
#
# Cleans the first :key token in place: trims non-alphanumeric characters
# from both ends, then removes a leading "bibitem{" (case-insensitive).
# Additional tokens are passed to #unmatched.
#
# Returns the same +hash+.
def normalize_key(hash)
  label, *extras = hash[:key]
  unmatched(:key, hash, extras) unless extras.empty?

  # Order matters: the backslash of "\bibitem{" has to be trimmed before
  # the anchored "bibitem{" pattern can match.
  [/^[^[:alnum:]]+|[^[:alnum:]]+$/, /^bibitem\{/i].each do |noise|
    label.gsub!(noise, '')
  end

  hash.tap { |h| h[:key] = label }
end
normalize_location(hash)
click to toggle source
# File lib/anystyle/parser/normalizer.rb, line 374
#
# Trims non-alphanumeric characters from both ends of the first :location
# token (in place). If +hash+ has no :publisher yet and the location
# contains a colon, the text before the first colon becomes the location
# and everything after it becomes the publisher. Additional tokens are
# passed to #unmatched.
#
# Returns the same +hash+.
def normalize_location(hash)
  location, *dangling = hash[:location]
  # File leftovers under the :location label (they used to be recorded
  # under :pages, a copy-and-paste slip).
  unmatched(:location, hash, dangling) unless dangling.empty?

  location.gsub!(/^[^[:alnum:]]+|[^[:alnum:]]+$/, '')

  if !hash.key?(:publisher) && location.include?(':')
    # Limit of 2: split at the first colon only, so a publisher that
    # itself contains a colon is kept whole instead of being cut off.
    location, publisher = location.split(/\s*:\s*/, 2)
    hash[:publisher] = publisher
  end

  hash[:location] = location
  hash
end
normalize_medium(hash)
click to toggle source
# File lib/anystyle/parser/normalizer.rb, line 407
#
# Rewrites the first :medium token as its word-character runs joined by
# hyphens (e.g. "[DVD video]" becomes "DVD-video"). Additional tokens are
# passed to #unmatched.
#
# Returns the same +hash+.
def normalize_medium(hash)
  label, *extras = hash[:medium]
  unmatched(:medium, hash, extras) unless extras.empty?

  words = label.scan(/\w+/)
  hash[:medium] = words.join('-')
  hash
end
normalize_names(names)
click to toggle source
# File lib/anystyle/parser/normalizer.rb, line 179
#
# Turns a raw name list into a single String of names joined by " and ".
#
# +names+ is modified in place: ellipses are removed and ";" / ":" are
# replaced by ",". The cleaned text is parsed with Namae.parse!; every
# parsed name has #normalize_initials called on it and contributes its
# #sort_order.
#
# If anything raises a StandardError, the message is printed via +warn+
# and the (already cleaned) +names+ String is returned instead.
def normalize_names(names)
  names.gsub!(/\s*(\.\.\.|…)\s*/, '')
  names.tr!(';:', ',')

  # Add surname/initial punctuation separator for Vancouver-style names
  # E.g. Rang HP, Dale MM, Ritter JM, Moore PK
  if names =~ /^(\p{Lu}[^\s,.]+)\s+([\p{Lu}][\p{Lu}\-]{0,3})(,|[.]?$)/
    names.gsub!(/\b(\p{Lu}[^\s,.]+)\s+([\p{Lu}][\p{Lu}\-]{0,3})(,|[.]?$)/, '\1, \2\3')
  end

  people = Namae.parse!(names)
  ordered = people.map do |person|
    person.normalize_initials
    person.sort_order
  end
  ordered.join(' and ')
rescue StandardError => e
  warn e.message
  names
end
normalize_pages(hash)
click to toggle source
# File lib/anystyle/parser/normalizer.rb, line 351
#
# Normalizes +hash[:pages]+ to "first–last" (joined by an en-dash) or to a
# single page number.
#
# A token of the form "volume.issue(year):pages" is additionally split
# into :volume, :number (when an issue is present), :year (when a year is
# present) and :pages. Additional tokens are passed to #unmatched.
#
# Returns the same +hash+.
def normalize_pages(hash)
  pages, *dangling = hash[:pages]

  unless dangling.empty?
    unmatched(:pages, hash, dangling)
    # Keep only the first token. Otherwise hash[:pages] stays an Array,
    # no pattern below can match it, and the value is left un-normalized.
    hash[:pages] = pages
  end

  # "volume.issue(year):pp"
  case pages
  when /(\d+) (?: \.(\d+))? (?: \( (\d{4}) \))? : (\d.*)/x
    hash[:volume] = $1.to_i
    hash[:number] = $2.to_i unless $2.nil?
    hash[:year] = $3.to_i unless $3.nil?
    hash[:pages] = $4
  end

  case hash[:pages]
  when /(\d+)\D+(\d+)/
    hash[:pages] = [$1, $2].join('–') # en-dash
  when /(\d+)/
    hash[:pages] = $1
  end

  hash
end
normalize_producer(hash)
click to toggle source
# File lib/anystyle/parser/normalizer.rb, line 168
#
# Strips surrounding non-word characters, the "produc"/"producer"/
# "produced" marker and the word "by" from +hash[:producer]+ (in place),
# then stores the result of #normalize_names under :producer.
#
# Returns the same +hash+.
def normalize_producer(hash)
  producers = hash[:producer]

  producers.gsub!(/^\W+|\W+$/, '')
  producers.gsub!(/[^[:alpha:]]*produc(er|ed)?[^[:alpha:]]*/i, '')
  producers.gsub!(/\bby\b/i, '')

  # Store under :producer. The result used to be written to :director,
  # which clobbered any director and left :producer un-normalized.
  hash[:producer] = normalize_names(producers)
  hash
end
normalize_publisher(hash)
click to toggle source
# File lib/anystyle/parser/normalizer.rb, line 333
#
# Applies the default #normalize to :publisher, then resolves the
# placeholder publishers "producer(s)", "author(s)" and "editor(s)"
# (case-insensitive) to the corresponding entry of +hash+.
#
# Returns the same +hash+.
def normalize_publisher(hash)
  normalize :publisher, hash

  case hash[:publisher]
  when /^producers?$/i
    hash[:publisher] = hash[:producer]

  when /^authors?$/i
    hash[:publisher] = hash[:author]

  # "editors?" to mirror the two branches above; the former "editor?"
  # made the final "r" optional and never matched the plural.
  when /^editors?$/i
    hash[:publisher] = hash[:editor]

  end

  hash
end
normalize_source(hash)
click to toggle source
# File lib/anystyle/parser/normalizer.rb, line 271
#
# Stores the first :source token back under :source; additional tokens
# are passed to #unmatched.
#
# Sources mentioning "dissertation abstracts" (case-insensitive) are
# tagged with +type: :thesis+. For those, a trailing "section X: <name>"
# part is cut off the source (in place) and <name> is stored as :category.
#
# Returns the same +hash+.
def normalize_source(hash)
  origin, *extras = hash[:source]
  unmatched(:source, hash, extras) unless extras.empty?

  if /dissertation abstracts/i =~ origin
    section = /\s*section \w: ([[:alnum:]\s]+).*$/i
    # gsub! returns nil when nothing matched, so no category is set then.
    hash[:category] = Regexp.last_match(1) if origin.gsub!(section, '')
    hash[:type] = :thesis
  end

  hash[:source] = origin
  hash
end
normalize_title(hash)
click to toggle source
# File lib/anystyle/parser/normalizer.rb, line 202
#
# Normalizes +hash[:title]+.
#
# When a second :title token exists it is treated as the source: it is
# stored under :source and run through the default #normalize. Edition
# details are moved out of the title by #extract_edition; the title is
# then trimmed of leading whitespace, trailing whitespace/punctuation and
# one pair of enclosing quotation marks.
#
# Returns the same +hash+.
def normalize_title(hash)
  heading, origin = hash[:title]

  unless origin.nil?
    hash[:source] = origin
    normalize(:source, hash)
  end

  extract_edition(heading, hash)

  [
    [/^\s+|[\.,:;\s]+$/, ''],
    [/^["'”’´‘“`](.+)["'”’´‘“`]$/, '\1']
  ].each { |pattern, replacement| heading.gsub!(pattern, replacement) }

  hash.tap { |h| h[:title] = heading }
end
normalize_translator(hash)
click to toggle source
# File lib/anystyle/parser/normalizer.rb, line 143
#
# Removes German and English translator markers ("übersetzt von",
# "trans.", "translated", "by", "trad.") and surrounding non-word
# characters from +hash[:translator]+ (in place), then stores the result
# of #normalize_names under :translator.
#
# Returns the same +hash+.
def normalize_translator(hash)
  people = hash[:translator]

  # Applied strictly in this order; each pattern sees the text left over
  # by the previous one.
  [
    /\b([Ii]n (d|ein)er )?[Üü]ber(s\.|setzt|setzung|tragen|tragung) v(\.|on\b)/,
    /^\W+|\W+$/,
    /[^[:alpha:]]*\btrans(l(ated)?)?\b[^[:alpha:]]*/i,
    /\bby\b/i,
    /\btrad\./i
  ].each { |marker| people.gsub!(marker, '') }

  hash.tap { |h| h[:translator] = normalize_names(people) }
end
normalize_url(hash)
click to toggle source
# File lib/anystyle/parser/normalizer.rb, line 399
#
# Reduces the first :url token to the URL it contains; when nothing
# URL-like is found the token is stored unchanged. Additional tokens are
# passed to #unmatched.
#
# The pattern accepts an optional scheme, a dotted host whose labels may
# contain hyphens, and a remainder that may include port, path, query
# string and fragment characters. The earlier pattern stopped at the first
# "?", "=", "&", "#" or ":" and dropped the scheme and first label of a
# hyphenated host.
#
# Returns the same +hash+.
def normalize_url(hash)
  url, *dangling = hash[:url]
  unmatched(:url, hash, dangling) unless dangling.empty?

  hash[:url] = url[%r{([a-z]+://)?[\w-]+\.[\w-]+[\w./%?=&#~:+@-]+}i] || url
  hash
end
normalize_volume(hash)
click to toggle source
# File lib/anystyle/parser/normalizer.rb, line 306
#
# Splits the first :volume token into :volume and, where present,
# :number or :pages. Additional tokens are passed to #unmatched.
#
# * "12:34-56" (only when +hash+ has no :pages yet) gives volume 12 and
#   pages "34-56", which are then run through #normalize_pages.
# * Otherwise the first matching pattern below decides; the volume is
#   stored as an Integer, the number as an Integer or, for ranges such as
#   "2-3", as a String.
#
# Returns the normalized hash.
def normalize_volume(hash)
  raw, *extras = hash[:volume]
  unmatched(:volume, hash, extras) unless extras.empty?

  if !hash.key?(:pages) && raw =~ /\D*(\d+):(\d+(?:[—–-]+)\d+)/
    # Read both captures before writing anything back.
    volume, pages = $1.to_i, $2
    hash[:volume] = volume
    hash[:pages] = pages
    return normalize_pages(hash)
  end

  # The order of the patterns is significant: the more specific forms
  # (number ranges, "no." markers) must be tried before the generic ones.
  case raw
  when /\D*(\d+)\D+(\d+[\s\/&—–-]+\d+)/
    volume, number = $1.to_i, $2
    hash[:volume] = volume
    hash[:number] = number
  when /(\d+)?\D+no\.\s*(\d+\D+\d+)/
    hash[:volume] = $1.to_i unless $1.nil?
    hash[:number] = $2
  when /(\d+)?\D+no\.\s*(\d+)/
    hash[:volume] = $1.to_i unless $1.nil?
    hash[:number] = $2.to_i
  when /\D*(\d+)\D+(\d+)/
    volume, number = $1.to_i, $2.to_i
    hash[:volume] = volume
    hash[:number] = number
  when /(\d+)/
    hash[:volume] = $1.to_i
  end

  hash
end
strip_et_al(names)
click to toggle source
# File lib/anystyle/parser/normalizer.rb, line 139
#
# Cuts the first "et al" / "et coll" / "u. a." / "and others" /
# "& others" marker, together with everything after it up to the end of
# that line, out of +names+ (in place).
#
# Returns true when something was removed, false otherwise.
def strip_et_al(names)
  marker = /(\bet\s+(al|coll)\b|\bu\.\s*a\.|(\band|\&)\s+others).*$/
  removed = names.sub!(marker, '')
  !removed.nil?
end
Private Instance Methods
unmatched(label, hash, tokens)
click to toggle source
# File lib/anystyle/parser/normalizer.rb, line 417
#
# Records tokens that could not be assigned to +label+: they are joined
# with single spaces and stored in +hash+ under the String key
# "unmatched-<label>".
#
# Returns the joined String.
def unmatched(label, hash, tokens)
  leftover = tokens.join(' ')
  slot = "unmatched-#{label}"
  hash[slot] = leftover
end