class Cab2xml::Converter
Attributes
attr_mode[RW]
token_format[RW]
Public Class Methods
new()
click to toggle source
# File lib/cab2xml/converter.rb, line 7
# Sets up a converter with its default settings.
#
# * +attr_mode+ starts as +:attr+; parse_extended_tag also recognises +:node+.
# * +token_format+ starts as +:mecab_unidic+; parse_cabocha_tag also
#   recognises +:chasen+.
def initialize
  @attr_mode, @token_format = :attr, :mecab_unidic
end
Public Instance Methods
add_node(parent, format, data)
click to toggle source
# File lib/cab2xml/converter.rb, line 36
# Renders +format+ with +data+ (via String#%), appends the resulting markup
# to +parent+ with <tt><<</tt>, and remembers the newly appended child.
#
# parent:: node-like object responding to <tt><<</tt> and +children+
# format:: format string containing the markup template
# data::   value (or array of values) substituted into +format+
#
# Returns the last child of +parent+ after the append; the same object is
# stored in <tt>@last</tt> so later tags can attach to it.
def add_node(parent, format, data)
  markup = format % data
  parent << markup
  @last = parent.children.last
end
check_namespace(key)
click to toggle source
# File lib/cab2xml/converter.rb, line 41
# Declares the namespace used by a prefixed attribute name on the current
# document node (<tt>@doc</tt>), once per prefix.
#
# key:: attribute name; only names containing ":" are considered. The part
#       before the first ":" is taken as the namespace prefix.
#
# Prefixes already recorded in <tt>@namespaces</tt> are skipped. Returns
# whatever <tt>@doc.add_namespace</tt> returns when a declaration is added,
# otherwise nil.
def check_namespace(key)
  # =~ (rather than include?) keeps nil keys harmless: nil =~ /:/ is nil.
  return if (key =~ /:/).nil?

  prefix = key.split(':', 2).first
  @namespaces ||= {}
  unless @namespaces[prefix]
    @namespaces[prefix] = true
    @doc.add_namespace prefix, 'http://www.ninjal.ac.jp/corpus_center/bccwj/' + prefix
  end
end
create_xml(mode)
click to toggle source
# File lib/cab2xml/converter.rb, line 29
# Creates the output XML document with a single empty root element named
# after +mode+, and marks it as UTF-8.
#
# mode:: +:corpora+ or +:document+. The root element is stored in
#        <tt>@corpora</tt> or <tt>@doc</tt> respectively; any other value
#        still creates the document but records neither.
#
# Returns the root element for +:document+, nil otherwise.
def create_xml(mode)
  @xml = Nokogiri::XML("<#{mode}/>")
  @xml.encoding = 'UTF-8'

  root = @xml.root
  @corpora = root if :corpora == mode
  return unless :document == mode

  @doc = root
end
parse(file)
click to toggle source
# File lib/cab2xml/converter.rb, line 11
# Reads +file+ line by line and builds the XML document from it.
#
# file:: IO-like object responding to +set_encoding+ and +each_line+
#        (e.g. File or StringIO). It is switched to UTF-8 before reading.
#
# Line dispatch:
# * empty lines and lines starting with "##" are skipped
# * lines starting with "#!" go to parse_extended_tag
# * everything else goes to parse_cabocha_tag
#
# Returns <tt>@xml</tt>, i.e. whatever document the tag handlers created, or
# nil when no line caused a document to be created.
def parse(file)
  # Start every run from a clean slate. Besides the document itself this
  # clears the node cursors and the namespace registry, so that a converter
  # instance can be reused: a stale registry would make check_namespace skip
  # the declaration on the new document.
  @xml = @corpora = @doc = @sen = @chunk = @tok = @last = nil
  @senid = @tokid = nil
  @namespaces = nil

  file.set_encoding 'UTF-8'
  file.each_line do |raw|
    line = raw.chomp
    case line
    when ''
      # ignore
    when /^##/
      # comment line
    when /^#!/
      parse_extended_tag line
    else
      parse_cabocha_tag line
    end
  end
  @xml
end
parse_cabocha_tag(line)
click to toggle source
# File lib/cab2xml/converter.rb, line 96
# Converts one line of CaboCha-style output into XML nodes.
#
# line:: a single input line without its line terminator
#
# Three kinds of line are handled:
# * chunk header ("* id linkREL head/func score"): creates the document in
#   +:document+ mode if none exists yet, opens a <sentence> when none is
#   open, then adds a <chunk> to it (stored in <tt>@chunk</tt>)
# * "EOS": closes the current sentence
# * anything else: a token line, parsed according to +token_format+
#   (+:chasen+ or +:mecab_unidic+) and added to the current chunk as <tok>
#
# Free-text values are XML-escaped before being interpolated, because
# add_node hands the formatted string to the XML layer as markup.
def parse_cabocha_tag(line)
  case line
  when /\A\*\s+\d/
    # A chunk header is "*" followed by a numeric chunk id. Requiring the id
    # keeps a token whose surface form is "*" from being mistaken for one.
    create_xml(:document) unless @xml
    unless @sen
      @sen = add_node(@doc, '<sentence id="%d"/>', @senid ||= 0)
      @senid += 1
      @tokid = 0
    end
    _marker, id, dep, headfunc, score = line.split(' ')
    # dep looks like "1D" / "-1D": target chunk id followed by a relation letter.
    link, rel = dep[0..-2], dep[-1]
    head, func = headfunc.split('/')
    data = [id, link, escape_xml_value(rel), head, func, escape_xml_value(score)]
    format = '<chunk id="%d" link="%d" rel="%s" head="%d" func="%d" score="%s"/>'
    @chunk = add_node(@sen, format, data)
  when 'EOS'
    @sen = nil
  else
    # NOTE(review): the attribute is spelled "cype" in both formats below; it
    # looks like a typo for "ctype" but is kept so existing output is unchanged.
    case token_format
    when :chasen
      # limit -1 keeps trailing empty columns; without it, words with empty
      # ctype/cform columns would leave the format short of arguments.
      surface, *columns = line.split(/\s/, -1)
      columns = Array.new(5) { |index| escape_xml_value(columns[index]) }
      data = [@tokid, *columns, escape_xml_value(surface)]
      format = '<tok id="%d" read="%s" base="%s" pos="%s" cype="%s" cform="%s">%s</tok>'
      @tok = add_node(@chunk, format, data)
      @tokid += 1
    when :mecab_unidic
      surface, features = line.split(/\s/, 2)
      # to_s tolerates a line that has no feature column at all.
      features = features.to_s.split(',').map { |item| item == '*' ? nil : item }
      # The first four features are joined into one hyphenated POS label;
      # "*" placeholders (mapped to nil above) are left out.
      pos = features[0, 4].compact.join('-')
      ctype, cform, lemma_form, lemma = features[4, 4]
      data = [@tokid] + [pos, ctype, cform, lemma_form, lemma, surface].map { |value| escape_xml_value(value) }
      format = '<tok id="%d" pos="%s" cype="%s" cform="%s" lemmaForm="%s" lemma="%s">%s</tok>'
      @tok = add_node(@chunk, format, data)
      @tokid += 1
    end
  end
end

# Escapes &, <, > and " so +value+ is safe both as element text and inside a
# double-quoted attribute. nil becomes the empty string, which is also what
# "%s" renders for nil.
def escape_xml_value(value)
  value.to_s.gsub(/[&<>"]/, { '&' => '&amp;', '<' => '&lt;', '>' => '&gt;', '"' => '&quot;' })
end
private :escape_xml_value
parse_extended_tag(line)
click to toggle source
# File lib/cab2xml/converter.rb, line 49
# Converts one extended tag line ("#! LABEL field field ...") into XML.
#
# line:: a single input line starting with "#!"; it is split into
#        space-separated fields with CSV, so fields may be double-quoted.
#
# Handled labels: DOCID, SENTENCETAGID, DOC, ATTR, SEGMENT, SEGMENT_S, LINK,
# LINK_S, GROUP and GROUP_S. Unknown labels are ignored. The node created for
# each label is kept in an instance variable (<tt>@docid</tt>,
# <tt>@doc</tt>, <tt>@seg</tt>, ...).
#
# ATTR depends on +attr_mode+: +:node+ adds an <ATTR> child to the most
# recently added node, +:attr+ sets an attribute on that node directly.
def parse_extended_tag(line)
  _marker, label, *fields = CSV.parse_line(line, :col_sep => "\s")
  # CSV yields nil for empty unquoted fields; treat those as empty strings.
  data = fields.map { |item| item.to_s.encode(:xml => :text) }
  # Values interpolated into double-quoted attributes also need their quotes
  # escaped, otherwise a '"' inside a field ends the attribute early.
  attrs = data.map { |item| item.gsub('"', '&quot;') }

  case label
  when 'DOCID'
    create_xml(:corpora) unless @xml
    # The id attribute must be quoted to be well-formed XML.
    format = '<DOCID id="%d">%s</DOCID>'
    @docid = add_node(@corpora, format, data)
  when 'SENTENCETAGID'
    format = '<SENTENCETAGID id="%d">%s</SENTENCETAGID>'
    @sentencetagid = add_node(@corpora, format, data)
  when 'DOC'
    format = '<document id="%d"/>'
    @doc = add_node(@corpora, format, data)
    @senid = 0
    # check_namespace declares each prefix on @doc only once; forget the
    # registry so the new <document> gets its own declarations.
    @namespaces = nil
  when 'ATTR'
    case @attr_mode
    when :node
      format = '<ATTR Key="%s" Value="%s"/>'
      @attr = add_node(@last, format, attrs)
    when :attr
      # NOTE(review): the value is already text-escaped here although it is
      # set through the node API rather than as markup; left as in the
      # original - confirm whether this double-escapes.
      key, value = data
      check_namespace key
      @last[key] = value
    end
  when 'SEGMENT'
    format = '<SEGMENT TagName="%s" StartGPos="%s" EndGPos="%s" Comments="%s"/>'
    @seg = add_node(@doc, format, attrs)
  when 'SEGMENT_S'
    format = '<SEGMENT_S TagName="%s" StartLPos="%s" EndLPos="%s" Comments="%s"/>'
    @seg = add_node(@sen, format, attrs)
  when 'LINK'
    format = '<LINK TagName="%s" FromSegNo="%s" EndSegNo="%s" Comments="%s"/>'
    @link = add_node(@doc, format, attrs)
  when 'LINK_S'
    format = '<LINK_S TagName="%s" FromSegSNo="%s" EndSegSNo="%s" Comments="%s"/>'
    @link = add_node(@sen, format, attrs)
  when 'GROUP'
    # First field is the tag name, last is the comment; everything in
    # between is the member list, emitted comma-separated.
    format = '<GROUP TagName="%s" SegNo="%s" Comments="%s"/>'
    members = Array(attrs[1..-2]).join(',')
    @group = add_node(@doc, format, [attrs[0], members, attrs[-1]])
  when 'GROUP_S'
    format = '<GROUP_S TagName="%s" SegSNo="%s" Comments="%s"/>'
    members = Array(attrs[1..-2]).join(',')
    @group = add_node(@sen, format, [attrs[0], members, attrs[-1]])
  end
end