package lingscope.drivers;
import java.util.ArrayList; import java.util.List; import lingscope.io.AnnotatedSentencesIO; import lingscope.structures.AnnotatedSentence;
/**
 * Merges two annotated files. Useful for merging a words scope file with a
 * POS cue file.
 *
 * @author shashank
 */
public class AnnotatedFilesMerger {

    /**
     * Merges the words of {@code wordsSentence} with the tags of
     * {@code tagsSentence} into one sentence whose tokens have the form
     * {@code word|tag} and are separated by single spaces.
     *
     * @param wordsSentence sentence whose {@code getWords()} supplies the words
     * @param tagsSentence sentence whose {@code getTags()} supplies the tags
     * @return the merged sentence, or {@code null} when the two sentences do
     *         not have the same number of tokens (a diagnostic is printed to
     *         standard error in that case)
     */
    public static AnnotatedSentence merge(AnnotatedSentence wordsSentence, AnnotatedSentence tagsSentence) {
        List<String> words = wordsSentence.getWords();
        List<String> tags = tagsSentence.getTags();
        int numTokens = words.size();
        if (tags.size() != numTokens) {
            System.err.println("Skipping non-equal length sentences");
            System.err.println("\tSentence 1: " + wordsSentence.getRawText());
            System.err.println("\tSentence 2: " + tagsSentence.getRawText());
            return null;
        }

        StringBuilder mergedSentence = new StringBuilder();
        for (int j = 0; j < numTokens; ++j) {
            // Emit the separator only between tokens. The previous approach
            // (always prepend a space, then substring(1)) threw
            // StringIndexOutOfBoundsException when there were no tokens.
            if (j > 0) {
                mergedSentence.append(' ');
            }
            mergedSentence.append(words.get(j)).append('|').append(tags.get(j));
        }
        // NOTE(review): with zero tokens this passes an empty string to the
        // AnnotatedSentence constructor - confirm that constructor accepts it.
        return new AnnotatedSentence(mergedSentence.toString());
    }

    /**
     * Reads two annotated files, merges them sentence by sentence and writes
     * the result. Sentence pairs that {@link #merge} rejects are skipped.
     *
     * @param args
     * 0 - file 1: the file from which words will be taken
     * 1 - file 2: the file from which tags will be taken
     * 2 - output file path
     * @throws IllegalArgumentException if fewer than three arguments are given
     */
    public static void main(String[] args) {
        if (args == null || args.length < 3) {
            throw new IllegalArgumentException(
                    "Usage: AnnotatedFilesMerger <words file> <tags file> <output file>");
        }

        List<AnnotatedSentence> wordsSentences = AnnotatedSentencesIO.read(args[0]);
        List<AnnotatedSentence> tagsSentences = AnnotatedSentencesIO.read(args[1]);

        int numWordsSentences = wordsSentences.size();
        int numTagsSentences = tagsSentences.size();
        // Only pairs present in both files can be merged. Bounding the loop by
        // the smaller count avoids an IndexOutOfBoundsException when the words
        // file is the shorter one.
        int numSentences = Math.min(numWordsSentences, numTagsSentences);
        if (numWordsSentences != numTagsSentences) {
            System.err.println("Warning: sentence counts differ (" + numWordsSentences
                    + " in " + args[0] + ", " + numTagsSentences + " in " + args[1]
                    + "); merging only the first " + numSentences);
        }

        List<AnnotatedSentence> mergedSentences = new ArrayList<AnnotatedSentence>(numSentences);
        for (int i = 0; i < numSentences; ++i) {
            AnnotatedSentence mergedSentence = merge(wordsSentences.get(i), tagsSentences.get(i));
            if (mergedSentence != null) {
                mergedSentences.add(mergedSentence);
            }
        }
        AnnotatedSentencesIO.write(args[2], mergedSentences);
    }
}