class Anystyle::Parser::Parser
Attributes
Public Class Methods
Returns the default parser instance, creating it on first use.
# File lib/anystyle/parser/parser.rb, line 43
#
# Returns the memoized default parser; it is built with +new+ the first
# time this method is called and reused afterwards.
def instance
  @instance = new unless @instance
  @instance
end
# File lib/anystyle/parser/parser.rb, line 47
#
# Asks the configured language detector to detect the language of +string+.
# Returns nil when no language detector has been set.
def language(string)
  @language_detector.detect(string) if @language_detector
end
# File lib/anystyle/parser/parser.rb, line 38
#
# Builds a new parser whose :model option is set to +path+.
def load(path)
  new(:model => path)
end
# File lib/anystyle/parser/parser.rb, line 57
#
# Creates a parser. The passed-in +options+ are merged over Parser.defaults,
# the model is (re)loaded from those options, and the shared Normalizer
# instance is stored for later use.
def initialize(options = {})
  defaults = Parser.defaults
  @options = defaults.merge(options)

  # reload reads @options, so it must run after the merge above
  reload

  @normalizer = Normalizer.instance
end
Public Instance Methods
# File lib/anystyle/parser/parser.rb, line 205
#
# Guesses a reference type for +hash+ and stores it under :type.
#
# A hash that already has a :type key is returned untouched. Otherwise the
# first matching rule wins, checked in this order: presence of :journal,
# the word "proceedings" in the text, presence of :medium, :booktitle or
# :source, :publisher, then a series of keyword matches on the text, with
# :misc as the fallback.
#
# Returns the (mutated) hash.
def classify(hash)
  return hash if hash.key?(:type)

  # Join the values with a space: without a separator, adjacent values fuse
  # together and can produce spurious keyword hits (e.g. "Joseph" followed
  # by "Dickens" would read as "...phD..." and be classified as a thesis).
  text = hash.values.flatten.join(' ')

  hash[:type] =
    if hash.key?(:journal)
      :article
    elsif text =~ /proceedings/i
      :inproceedings
    elsif hash.key?(:medium)
      # Known moving-image media map to :motion_picture; any other medium
      # value is used as the type as-is.
      if hash[:medium].to_s =~ /dvd|video|vhs|motion|television/i
        :motion_picture
      else
        hash[:medium]
      end
    elsif hash.key?(:booktitle) || hash.key?(:source)
      :incollection
    elsif hash.key?(:publisher)
      :book
    elsif text =~ /ph(\.\s*)?d|diss(\.|ertation)|thesis/i
      :thesis
    elsif text =~ /\b[Pp]atent\b/
      :patent
    elsif text =~ /\b[Pp]ersonal [Cc]ommunication\b/
      :personal_communication
    elsif hash.key?(:authority)
      :techreport
    elsif text =~ /interview/i
      :interview
    elsif text =~ /videotape/i
      :videotape
    elsif text =~ /unpublished/i
      :unpublished
    else
      :misc
    end

  hash
end
Expands the passed-in token string by appending a space separated list of all features for the token.
# File lib/anystyle/parser/parser.rb, line 148
#
# Builds the expanded form of +token+: the token itself, followed by every
# feature computed for it, followed by +label+ when one is given, all
# joined by single spaces.
def expand(token, sequence = [], offset = 0, label = nil)
  columns = [token].concat(features_for(token, strip(token), sequence, offset))
  columns << label unless label.nil?
  columns.join(' ')
end
Returns an array of label/segment pairs for each line in the passed-in string.
# File lib/anystyle/parser/parser.rb, line 81
#
# Labels the prepared +input+ with the model and, for every labelled
# sequence, returns an array of [label, segment] pairs. Only the leading
# non-whitespace part of each labelled token is kept, and consecutive
# tokens sharing the same label are merged into one space-separated segment.
def label(input, labelled = false)
  model.label(prepare(input, labelled)).map! do |sequence|
    sequence.each_with_object([]) do |(token, tag), segments|
      word, tag = token[/^\S+/], tag.to_sym
      previous = segments[-1]

      if previous && previous[0] == tag
        # same label as the preceding token: extend that segment in place
        previous[1] << ' ' << word
      else
        segments << [tag, word]
      end
    end
  end
end
Trains the model by appending the training data without truncating the current model. @see train
# File lib/anystyle/parser/parser.rb, line 171
#
# Trains on +input+ via #train with truncation switched off, i.e. without
# replacing the current model first.
def learn(input)
  truncate = false
  train(input, truncate)
end
# File lib/anystyle/parser/parser.rb, line 127
#
# Splits +string+ at line breaks. Spaces and tabs before a break and any
# whitespace after it are consumed by the separator.
def lines(string)
  line_break = /[ \t]*[\n\r]\s*/
  string.split(line_break)
end
# File lib/anystyle/parser/parser.rb, line 193
#
# Adds a :language entry to +hash+ unless one is already present. The
# language is detected (via Parser.language) from the :title, :booktitle,
# :location and :publisher values joined by spaces; when none of these is
# set, no language is assigned.
#
# Returns the (possibly mutated) hash.
def localize(hash)
  unless hash.has_key?(:language)
    fields = [:title, :booktitle, :location, :publisher]
    text = hash.values_at(*fields).compact.join(' ')

    hash[:language] = Parser.language(text) unless text.empty?
  end

  hash
end
# File lib/anystyle/parser/parser.rb, line 180
#
# Runs the normalizer's normalize_<label> method for every key of +hash+,
# then classifies and localizes the hash. An error raised for a label is
# reported with +warn+ and does not stop the remaining labels.
#
# Returns the result of #localize.
def normalize(hash)
  # Iterate over a snapshot of the keys taken before any normalizer runs.
  labels = hash.keys

  labels.each do |label|
    begin
      normalizer.send("normalize_#{label}", hash)
    rescue => error
      warn error.message
    end
  end

  classify(hash)
  localize(hash)
end
# File lib/anystyle/parser/parser.rb, line 71
#
# Labels +input+ and converts the result with the format_<format> method.
# +format+ defaults to the :format option.
#
# Raises ArgumentError when no (public or private) format_<format> method
# exists; the check happens before the input is labelled.
def parse(input, format = options[:format])
  formatter = :"format_#{format}"

  unless respond_to?(formatter, true)
    raise ArgumentError, "format not supported: #{formatter}"
  end

  send(formatter, label(input))
end
Prepares the passed-in string for processing by a CRF tagger. The string is split into separate lines; each line is tokenized and expanded. Returns an array of sequence arrays that can be labelled by the CRF model.
If the string is marked as being tagged by passing `true` as the second argument, training labels will be extracted from the string and appended after feature expansion. The returned sequence arrays can be used for training or testing the CRF model.
# File lib/anystyle/parser/parser.rb, line 140
#
# Converts +input+ to a string, tokenizes it and expands every token,
# passing #expand the token, its sequence, its index in that sequence and
# its label (nil for plain, untagged tokens).
#
# Returns one array of expanded tokens per tokenized sequence.
def prepare(input, tagged = false)
  tokenize(input_to_s(input), tagged).map do |sequence|
    sequence.each_with_index.map do |(token, tag), position|
      expand(token, sequence, position, tag)
    end
  end
end
# File lib/anystyle/parser/parser.rb, line 65
#
# Loads the model from the path stored in the :model option (through
# Wapiti.load) and updates the model's options with the parser's options.
#
# Returns self.
def reload
  model_path = @options[:model]

  @model = Wapiti.load(model_path)
  @model.options.update_attributes(@options)

  self
end
# File lib/anystyle/parser/parser.rb, line 175
#
# Runs check! on the model's options, then labels +input+, which is
# prepared as tagged data. Returns whatever the model's label call returns.
def test(input)
  model.options.check!

  tagger = model
  tagger.label(prepare(input, true))
end
Returns an array of tokens for each line of input.
If the passed-in string is marked as being tagged, extracts labels from the string and returns an array of token/label pairs for each line of input.
# File lib/anystyle/parser/parser.rb, line 100
#
# Splits +string+ into lines and each line into tokens.
#
# Untagged input: every line is split on the :separator option and empty
# tokens are dropped, giving an array of token strings per line.
#
# Tagged input: every line is split on the :tagged_separator option.
# Opening tags (<name>) are pushed on a stack, closing tags (</name>) must
# match the tag popped from it, and every other non-empty part becomes a
# [decoded text, label] pair, labelled with the innermost open tag or
# :unknown when no tag is open.
#
# Raises ArgumentError when a closing tag does not match the current tag.
def tokenize(string, tagged = false)
  unless tagged
    return lines(string).map { |line| line.split(options[:separator]).reject(&:empty?) }
  end

  lines(string).each_with_index.map do |line, number|
    open_tags = []

    line.split(options[:tagged_separator]).each_with_object([]) do |part, pairs|
      case part
      when /^$/
        # skip
      when /^<([^\/>][^>]*)>$/
        open_tags << Regexp.last_match(1)
      when /^<\/([^>]+)>$/
        closing = Regexp.last_match(1)
        current = open_tags.pop

        unless current == closing
          raise ArgumentError, "mismatched tags on line #{number}: #{closing.inspect} (current tag was #{current.inspect})"
        end
      else
        pairs << [decode_xml_text(part), (open_tags[-1] || :unknown).to_sym]
      end
    end
  end
end
# File lib/anystyle/parser/parser.rb, line 155
#
# Trains the model on +input+ (defaults to the :training_data option),
# which is prepared as tagged data. With +truncate+ set (the default) the
# current model is first replaced by a new Wapiti::Model built from all
# options except :model. Nil or empty input skips the training step.
# Finally the model's path is set to the :model option.
#
# Returns the model.
def train(input = options[:training_data], truncate = true)
  @model = Wapiti::Model.new(options.reject { |key, _| key == :model }) if truncate

  @model.train(prepare(input, true)) unless input.nil? || input.empty?

  @model.path = options[:model]
  @model
end
Private Instance Methods
# File lib/anystyle/parser/parser.rb, line 275
#
# Returns a copy of +string+ in which every "&amp;", "&gt;" and "&lt;"
# entity is replaced by the value stored for it in the :xml_entities option.
def decode_xml_text(string)
  string.gsub(/&(amp|gt|lt);/) { |entity| options[:xml_entities][entity] }
end
# File lib/anystyle/parser/parser.rb, line 281
#
# Returns +string+ re-encoded to its own encoding with the :xml => :text
# option, which escapes it for use as XML text.
def encode_xml_text(string)
  target = string.encoding
  string.encode(target, :xml => :text)
end
# File lib/anystyle/parser/parser.rb 267 def features_for(*arguments) 268 Parser.features.map { |f| f.match(*arguments) } 269 end
# File lib/anystyle/parser/parser.rb, line 285
#
# Converts +labels+ into a BibTeX::Bibliography with one BibTeX::Entry per
# normalized reference.
#
# Before an entry is created its hash is rewritten: :type moves to
# :bibtex_type, then :genre becomes :type, :location becomes :address and
# :accessed becomes :urldate. An :authority value becomes :institution for
# tech reports and theses, and :organization otherwise.
def format_bibtex(labels)
  bibliography = BibTeX::Bibliography.new

  format_normalized(labels).each do |entry|
    entry[:bibtex_type] = entry.delete(:type)

    # Rename in this order: :type was just vacated above, so :genre may take it.
    [[:genre, :type], [:location, :address], [:accessed, :urldate]].each do |from, to|
      entry[to] = entry.delete(from) if entry.key?(from)
    end

    if entry.key?(:authority)
      institutional = [:techreport, :thesis].include?(entry[:bibtex_type])
      target = institutional ? :institution : :organization
      entry[target] = entry.delete(:authority)
    end

    bibliography << BibTeX::Entry.new(entry)
  end

  bibliography
end
# File lib/anystyle/parser/parser.rb, line 332
#
# Formats +labels+ as BibTeX and returns the result of calling
# +to_citeproc+ on it.
def format_citeproc(labels)
  bibliography = format_bibtex(labels)
  bibliography.to_citeproc
end
# File lib/anystyle/parser/parser.rb, line 315
#
# Turns every line of [label, token] pairs into a hash keyed by label.
# A label that occurs once maps to its token; when a label occurs again,
# its value becomes a flat array of all its tokens in order.
def format_hash(labels)
  labels.map do |line|
    line.each_with_object({}) do |(label, token), entry|
      entry[label] = entry.key?(label) ? [entry[label]].flatten << token : token
    end
  end
end
# File lib/anystyle/parser/parser.rb, line 328
#
# Formats +labels+ as hashes and passes each of them through #normalize.
def format_normalized(labels)
  format_hash(labels).map do |entry|
    normalize(entry)
  end
end
# File lib/anystyle/parser/parser.rb, line 307
#
# Breaks every [label, segment] pair of each line back into individual
# tokens: the segment is split on whitespace and each token is paired
# with the segment's label.
def format_raw(labels)
  labels.map do |line|
    line.flat_map do |label, segment|
      segment.split(' ').map { |token| [label, token] }
    end
  end
end
# File lib/anystyle/parser/parser.rb, line 342
#
# Renders +labels+ as XML with Builder::XmlMarkup: an XML declaration with
# UTF-8 encoding, then a <references> element containing one <reference>
# per line, each holding one element per [label, segment] pair (named
# after the label, with the segment as its content).
#
# Returns whatever the builder's +references+ call returns.
def format_xml(labels)
  builder = Builder::XmlMarkup.new
  builder.instruct!(:xml, :encoding => 'UTF-8')

  builder.references do |references|
    labels.each do |line|
      references.reference do |reference|
        line.each { |label, segment| reference.tag!(label, segment) }
      end
    end
  end
end
# File lib/anystyle/parser/parser.rb, line 249
#
# Converts +input+ to the string that will be parsed.
#
# * A String shorter than 128 characters that names an existing regular
#   file is treated as a path: the file is read as UTF-8 and its content
#   returned. Any other String is returned as is.
# * An Array is joined with newlines.
#
# NOTE(review): a short reference string that happens to equal the name of
# an existing file is read from disk instead of being parsed as text.
#
# Raises ArgumentError for any other kind of input.
def input_to_s(input)
  case input
  when String
    # Taint tracking (and String#tainted?) was removed in Ruby 3.2; only
    # consult it on Rubies that still provide it.
    tainted = input.respond_to?(:tainted?) && input.tainted?

    # File.file? instead of the removed File.exists?; it also keeps
    # directory names from being opened as if they were files.
    if !tainted && input.length < 128 && File.file?(input)
      # the block form closes the file even when reading raises
      File.open(input, 'r:UTF-8') { |file| file.read }
    else
      input
    end
  when Array
    input.join("\n")
  else
    raise ArgumentError, "invalid input: #{input.class}"
  end
end
# File lib/anystyle/parser/parser.rb, line 271
#
# Returns a copy of +token+ with every match of the :strip option removed.
def strip(token)
  pattern = options[:strip]
  token.gsub(pattern, '')
end