class Anystyle::Parser::Parser

Attributes

defaults[R]
feature[R]
features[R]
formats[R]
model[RW]
normalizer[RW]
options[R]

Public Class Methods

instance() click to toggle source

Returns a default parser instance.

   # File lib/anystyle/parser/parser.rb
# Returns the default parser, building it with +new+ on first use.
#
# The parser is memoized in @instance, so later calls return the same
# object.
def instance
  return @instance if @instance

  @instance = new
end
language(string) click to toggle source
   # File lib/anystyle/parser/parser.rb
# Asks the configured language detector to classify +string+.
#
# Returns the result of the detector's #detect call, or nil when no
# detector has been set in @language_detector.
def language(string)
  @language_detector.detect(string) if @language_detector
end
load(path) click to toggle source
   # File lib/anystyle/parser/parser.rb
# Creates a new parser whose :model option is set to +path+.
def load(path)
  new(model: path)
end
new(options = {}) click to toggle source
   # File lib/anystyle/parser/parser.rb
# Sets up a parser.
#
# options:: settings merged over Parser.defaults; entries given here win.
#
# After the options are stored the model is loaded through #reload, and
# the Normalizer singleton is installed as this parser's normalizer.
def initialize(options = {})
  @options = Parser.defaults.merge(options)
  reload
  @normalizer = Normalizer.instance
end

Public Instance Methods

classify(hash) click to toggle source
    # File lib/anystyle/parser/parser.rb
# Guesses a reference type for +hash+ and stores it under :type.
#
# A hash that already has a :type key is returned untouched. Otherwise the
# first matching rule below wins; rules look either at which keys are
# present or at the combined text of all values. When nothing matches the
# type is :misc.
#
# hash:: a Hash of field name => value (values may be Strings or Arrays).
#
# Returns the same hash object, with :type set.
def classify(hash)
  return hash if hash.key?(:type)

  # Join with a space: without a separator adjacent values run together
  # (e.g. "Widget" + "Patent" => "WidgetPatent"), which defeats the \b
  # anchored patterns below.
  text = hash.values.flatten.join(' ')

  hash[:type] =
    case
    when hash.key?(:journal)
      :article
    when text =~ /proceedings/i
      :inproceedings
    when hash.key?(:medium)
      # Film-like media collapse into one type; any other medium is used
      # as the type verbatim.
      if hash[:medium].to_s =~ /dvd|video|vhs|motion|television/i
        :motion_picture
      else
        hash[:medium]
      end
    when hash.key?(:booktitle), hash.key?(:source)
      :incollection
    when hash.key?(:publisher)
      :book
    when text =~ /ph(\.\s*)?d|diss(\.|ertation)|thesis/i
      :thesis
    when text =~ /\b[Pp]atent\b/
      :patent
    when text =~ /\b[Pp]ersonal [Cc]ommunication\b/
      :personal_communication
    when hash.key?(:authority)
      :techreport
    when text =~ /interview/i
      :interview
    when text =~ /videotape/i
      :videotape
    when text =~ /unpublished/i
      :unpublished
    else
      :misc
    end

  hash
end
expand(token, sequence = [], offset = 0, label = nil) click to toggle source

Expands the passed-in token string by appending a space separated list of all features for the token.

    # File lib/anystyle/parser/parser.rb
# Builds the feature line for a single token.
#
# The line starts with the token itself, continues with every value
# returned by #features_for (called with the token, its stripped form, the
# sequence and the offset) and, when +label+ is not nil, ends with the
# label. All columns are separated by a single space.
def expand(token, sequence = [], offset = 0, label = nil)
  columns = [token, *features_for(token, strip(token), sequence, offset)]
  columns << label unless label.nil?
  columns.join(' ')
end
label(input, labelled = false) click to toggle source

Returns an array of label/segment pairs for each line in the passed-in string.

   # File lib/anystyle/parser/parser.rb
# Labels +input+ with the model and merges neighbouring tokens that share
# a label into one segment.
#
# input::    passed to #prepare together with +labelled+.
# labelled:: forwarded to #prepare as its second argument.
#
# Returns, per sequence, an array of [label, segment] pairs where label is
# a Symbol and segment is the space-joined run of tokens.
def label(input, labelled = false)
  model.label(prepare(input, labelled)).map! do |sequence|
    sequence.each_with_object([]) do |(token, tag), segments|
      # Only the leading run of non-whitespace characters is kept.
      word = token[/^\S+/]
      tag = tag.to_sym
      current = segments.last

      if current && current[0] == tag
        current[1] << ' ' << word
      else
        segments << [tag, word]
      end
    end
  end
end
learn(input) click to toggle source

Trains the model by appending the training data without truncating the current model. See train.

    # File lib/anystyle/parser/parser.rb
# Trains on +input+ while keeping the current model: delegates to #train
# with its truncate argument set to false.
def learn(input)
  train(input, false)
end
lines(string) click to toggle source
    # File lib/anystyle/parser/parser.rb
# Splits +string+ into lines.
#
# A break is a newline or carriage return together with any spaces/tabs
# before it and any whitespace after it, so trailing blanks, leading
# indentation and blank lines between entries are swallowed by the split.
def lines(string)
  line_break = /[ \t]*[\n\r]\s*/
  string.split(line_break)
end
localize(hash) click to toggle source
    # File lib/anystyle/parser/parser.rb
# Adds a :language entry to +hash+ unless one is already present.
#
# The language is detected via Parser.language from the :title,
# :booktitle, :location and :publisher values (missing ones are skipped)
# joined by spaces. If none of those fields has content the hash is left
# as it is.
#
# Returns the same hash object.
def localize(hash)
  unless hash.key?(:language)
    sample = hash.values_at(:title, :booktitle, :location, :publisher)
    sample = sample.compact.join(' ')

    hash[:language] = Parser.language(sample) unless sample.empty?
  end

  hash
end
normalize(hash) click to toggle source
    # File lib/anystyle/parser/parser.rb
# Runs the matching normalizer method for every key in +hash+, then
# classifies and localizes the result.
#
# For a key +k+ the normalizer receives normalize_k(hash). An error raised
# by a normalizer is reported with +warn+ and does not stop the remaining
# keys from being processed.
#
# Returns the value of #localize for the hash.
def normalize(hash)
  # Work on a snapshot of the keys; the normalizer is handed the hash
  # itself (presumably so it can rewrite entries in place).
  labels = hash.keys

  labels.each do |label|
    begin
      normalizer.send("normalize_#{label}", hash)
    rescue => error
      warn error.message
    end
  end

  classify(hash)
  localize(hash)
end
parse(input, format = options[:format]) click to toggle source
   # File lib/anystyle/parser/parser.rb
# Labels +input+ and renders the result in the requested format.
#
# input::  anything accepted by #label.
# format:: name of the output format; the method format_<format> (public
#          or private) must exist. Defaults to options[:format].
#
# Raises ArgumentError when no such formatter method exists.
def parse(input, format = options[:format])
  formatter = :"format_#{format}"

  unless respond_to?(formatter, true)
    raise ArgumentError, "format not supported: #{formatter}"
  end

  send(formatter, label(input))
end
prepare(input, tagged = false) click to toggle source

Prepares the passed-in string for processing by a CRF tagger. The string is split into separate lines; each line is tokenized and expanded. Returns an array of sequence arrays that can be labelled by the CRF model.

If the string is marked as being tagged by passing true as the second argument, training labels will be extracted from the string and appended after feature expansion. The returned sequence arrays can be used for training or testing the CRF model.

    # File lib/anystyle/parser/parser.rb
# Turns +input+ into sequences of expanded feature lines.
#
# The input is converted to a string, tokenized (honouring +tagged+) and
# every token is passed through #expand together with its sequence, its
# position and, for tagged input, its label.
#
# Returns one array of expanded lines per tokenized sequence.
def prepare(input, tagged = false)
  text = input_to_s(input)

  tokenize(text, tagged).map do |sequence|
    sequence.each_with_index.map do |(token, label), index|
      expand(token, sequence, index, label)
    end
  end
end
reload() click to toggle source
   # File lib/anystyle/parser/parser.rb
# (Re)loads the model named by the :model option through Wapiti.load and
# pushes the parser's options into the model's own options.
#
# Returns self.
def reload
  @model = Wapiti.load(@options[:model])
  @model.options.update_attributes(@options)
  self
end
test(input) click to toggle source
    # File lib/anystyle/parser/parser.rb
# Labels tagged +input+ with the model's check! option switched on.
#
# The input is prepared as tagged data (labels included) before being
# handed to the model. Returns the model's labelling result.
def test(input)
  model.options.check!
  model.label(prepare(input, true))
end
tokenize(string, tagged = false) click to toggle source

Returns an array of tokens for each line of input.

If the passed-in string is marked as being tagged, extracts labels from the string and returns an array of token/label pairs for each line of input.

    # File lib/anystyle/parser/parser.rb
# Splits +string+ into one token list per line.
#
# Untagged input: each line is split on options[:separator] and empty
# tokens are dropped.
#
# Tagged input: each line is split on options[:tagged_separator]. Opening
# tags (<name>) are pushed on a stack, closing tags (</name>) must match
# the most recently opened tag, and every other non-empty piece becomes a
# [text, label] pair. The text is XML-decoded; the label is the innermost
# open tag as a Symbol, or :unknown outside of any tag.
#
# Raises ArgumentError when a closing tag does not match the current tag.
def tokenize(string, tagged = false)
  unless tagged
    return lines(string).map do |line|
      line.split(options[:separator]).reject(&:empty?)
    end
  end

  lines(string).each_with_index.map do |line, index|
    open_tags = []

    line.split(options[:tagged_separator]).each_with_object([]) do |piece, tokens|
      case piece
      when /^$/
        next
      when /^<([^\/>][^>]*)>$/
        open_tags << $1
      when /^<\/([^>]+)>$/
        closing = $1
        unless (tag = open_tags.pop) == closing
          raise ArgumentError, "mismatched tags on line #{index}: #{closing.inspect} (current tag was #{tag.inspect})"
        end
      else
        tokens << [decode_xml_text(piece), (open_tags[-1] || :unknown).to_sym]
      end
    end
  end
end
train(input = options[:training_data], truncate = true) click to toggle source
    # File lib/anystyle/parser/parser.rb
# Trains the model on tagged +input+.
#
# input::    tagged training data; defaults to options[:training_data].
#            Nil or empty input skips the training step.
# truncate:: when true, the current model is first replaced by a new
#            Wapiti::Model built from the options without :model.
#
# The model's path is set to options[:model] in every case. Returns the
# model.
def train(input = options[:training_data], truncate = true)
  @model = Wapiti::Model.new(options.reject { |key, _| key == :model }) if truncate
  @model.train(prepare(input, true)) unless input.nil? || input.empty?
  @model.path = options[:model]
  @model
end

Private Instance Methods

decode_xml_text(string) click to toggle source
    # File lib/anystyle/parser/parser.rb
# Replaces the XML entities &amp;, &gt; and &lt; in +string+ with the
# values found for them in options[:xml_entities]. Other entities are left
# as they are. Returns a new string.
def decode_xml_text(string)
  string.gsub(/&(amp|gt|lt);/) { |entity| options[:xml_entities][entity] }
end
encode_xml_text(string) click to toggle source
    # File lib/anystyle/parser/parser.rb
# Escapes +string+ for use as XML text by re-encoding it to its own
# encoding with the :xml => :text option of String#encode.
def encode_xml_text(string)
  string.encode(string.encoding, xml: :text)
end
features_for(*arguments) click to toggle source
    # File lib/anystyle/parser/parser.rb
# Calls #match with the given arguments on every entry of Parser.features
# and returns the results in the same order.
def features_for(*arguments)
  Parser.features.map { |feature| feature.match(*arguments) }
end
format_bibtex(labels) click to toggle source
    # File lib/anystyle/parser/parser.rb
# Converts labelled references into a BibTeX::Bibliography.
#
# Each normalized hash is renamed field by field before it becomes a
# BibTeX::Entry:
#
# * :type      -> :bibtex_type
# * :genre     -> :type
# * :location  -> :address
# * :accessed  -> :urldate
# * :authority -> :institution for techreport/thesis entries,
#                 :organization otherwise
def format_bibtex(labels)
  bibliography = BibTeX::Bibliography.new

  format_normalized(labels).each do |entry|
    entry[:bibtex_type] = entry.delete(:type)

    # :type must be vacated above before :genre can take its place.
    { genre: :type, location: :address, accessed: :urldate }.each do |from, to|
      entry[to] = entry.delete(from) if entry.key?(from)
    end

    if entry.key?(:authority)
      report_like = [:techreport, :thesis].include?(entry[:bibtex_type])
      target = report_like ? :institution : :organization
      entry[target] = entry.delete(:authority)
    end

    bibliography << BibTeX::Entry.new(entry)
  end

  bibliography
end
format_citeproc(labels) click to toggle source
    # File lib/anystyle/parser/parser.rb
# Formats +labels+ as BibTeX first and converts the resulting bibliography
# with its #to_citeproc method.
def format_citeproc(labels)
  bibliography = format_bibtex(labels)
  bibliography.to_citeproc
end
format_hash(labels) click to toggle source
    # File lib/anystyle/parser/parser.rb
# Turns each line of [label, token] pairs into a Hash keyed by label.
#
# A label that occurs once maps to its token; a label that occurs several
# times maps to a flat array of its tokens in order of appearance.
def format_hash(labels)
  labels.map do |line|
    line.each_with_object({}) do |(label, token), fields|
      fields[label] =
        if fields.key?(label)
          [fields[label]].flatten << token
        else
          token
        end
    end
  end
end
format_normalized(labels) click to toggle source
    # File lib/anystyle/parser/parser.rb
# Formats +labels+ as hashes and passes each hash through #normalize.
def format_normalized(labels)
  format_hash(labels).map { |fields| normalize(fields) }
end
format_raw(labels) click to toggle source
    # File lib/anystyle/parser/parser.rb
# Breaks merged segments back into single tokens.
#
# For every line, each [label, segment] pair is expanded into one
# [label, token] pair per whitespace-separated token of the segment.
def format_raw(labels)
  labels.map do |line|
    line.flat_map do |label, segment|
      segment.split(' ').map { |token| [label, token] }
    end
  end
end
format_tags(labels) click to toggle source
    # File lib/anystyle/parser/parser.rb
# Renders every line as a string of <label>text</label> elements separated
# by single spaces; the text is XML-escaped with #encode_xml_text.
def format_tags(labels)
  labels.map do |line|
    elements = line.map do |label, token|
      "<#{label}>#{encode_xml_text(token)}</#{label}>"
    end

    elements.join(' ')
  end
end
format_xml(labels) click to toggle source
    # File lib/anystyle/parser/parser.rb
# Builds an XML document with Builder::XmlMarkup: an XML declaration
# (UTF-8), a <references> root, one <reference> per line and inside it one
# element per [label, segment] pair, named after the label.
#
# Returns the value of the outer +references+ builder call.
def format_xml(labels)
  builder = Builder::XmlMarkup.new
  builder.instruct!(:xml, encoding: 'UTF-8')

  builder.references do |references|
    labels.each do |line|
      references.reference do |reference|
        line.each { |label, segment| reference.tag!(label, segment) }
      end
    end
  end
end
input_to_s(input) click to toggle source
    # File lib/anystyle/parser/parser.rb
# Converts +input+ into the string that will be tokenized.
#
# input:: a String or an Array.
#         * A String shorter than 128 characters that names an existing
#           file is treated as a path: the file is read as UTF-8 and its
#           content returned. Any other String is returned as is.
#         * An Array is joined with newlines.
#
# Raises ArgumentError for any other kind of input.
def input_to_s(input)
  case input
  when String
    # String#tainted? was removed in Ruby 3.2; only consult it on
    # interpreters that still provide it.
    tainted = input.respond_to?(:tainted?) && input.tainted?

    if !tainted && input.length < 128 && File.exist?(input)
      # File.read opens and closes the file itself, so no handle can leak.
      File.read(input, mode: 'r:UTF-8')
    else
      input
    end
  when Array
    input.join("\n")
  else
    raise ArgumentError, "invalid input: #{input.class}"
  end
end
strip(token) click to toggle source
    # File lib/anystyle/parser/parser.rb
# Returns a copy of +token+ with everything matching options[:strip]
# removed.
def strip(token)
  unwanted = options[:strip]
  token.gsub(unwanted, '')
end